Module Name:    src
Committed By:   ryo
Date:           Sat Aug 14 17:51:20 UTC 2021

Modified Files:
        src/common/lib/libc/gmon: mcount.c
        src/sys/arch/aarch64/include: cpu.h
        src/sys/arch/alpha/include: cpu.h
        src/sys/arch/arm/include: cpu.h
        src/sys/arch/hppa/include: cpu.h
        src/sys/arch/mips/include: cpu.h
        src/sys/arch/or1k/include: cpu.h
        src/sys/arch/powerpc/include: cpu.h
        src/sys/arch/riscv/include: cpu.h
        src/sys/arch/sparc/include: cpu.h
        src/sys/arch/sparc64/include: cpu.h
        src/sys/arch/vax/include: cpu.h
        src/sys/arch/x86/include: cpu.h
        src/sys/kern: kern_clock.c subr_prof.c
        src/sys/sys: gmon.h
        src/usr.sbin/kgmon: kgmon.8 kgmon.c

Log Message:
Improved the performance of kernel profiling on MULTIPROCESSOR, and made it
possible to get profiling data for each CPU.

In the current implementation, a lock is acquired at the entrance of the mcount
internal function, so the higher the number of cores, the more lock contention
occurs, making profiling in a MULTIPROCESSOR environment unusably slow.
Profiling buffers have been changed to be reserved for each CPU, improving
profiling performance in MP by several to several dozen times.

- Eliminated cpu_simple_lock in mcount internal function, using per-CPU buffers.
- Add ci_gmon member to struct cpu_info of each MP arch.
- Add kern.profiling.percpu node in sysctl tree.
- Add new -c <cpuid> option to kgmon(8) to specify the cpuid, like OpenBSD.
  For compatibility, if the -c option is not specified, kgmon(8) operates on
  the entire system as before, and the -p option will get the total profiling
  data for all CPUs.


To generate a diff of this commit:
cvs rdiff -u -r1.15 -r1.16 src/common/lib/libc/gmon/mcount.c
cvs rdiff -u -r1.37 -r1.38 src/sys/arch/aarch64/include/cpu.h
cvs rdiff -u -r1.103 -r1.104 src/sys/arch/alpha/include/cpu.h
cvs rdiff -u -r1.118 -r1.119 src/sys/arch/arm/include/cpu.h
cvs rdiff -u -r1.10 -r1.11 src/sys/arch/hppa/include/cpu.h
cvs rdiff -u -r1.132 -r1.133 src/sys/arch/mips/include/cpu.h
cvs rdiff -u -r1.4 -r1.5 src/sys/arch/or1k/include/cpu.h
cvs rdiff -u -r1.118 -r1.119 src/sys/arch/powerpc/include/cpu.h
cvs rdiff -u -r1.7 -r1.8 src/sys/arch/riscv/include/cpu.h
cvs rdiff -u -r1.109 -r1.110 src/sys/arch/sparc/include/cpu.h
cvs rdiff -u -r1.132 -r1.133 src/sys/arch/sparc64/include/cpu.h
cvs rdiff -u -r1.104 -r1.105 src/sys/arch/vax/include/cpu.h
cvs rdiff -u -r1.130 -r1.131 src/sys/arch/x86/include/cpu.h
cvs rdiff -u -r1.144 -r1.145 src/sys/kern/kern_clock.c
cvs rdiff -u -r1.49 -r1.50 src/sys/kern/subr_prof.c
cvs rdiff -u -r1.10 -r1.11 src/sys/sys/gmon.h
cvs rdiff -u -r1.19 -r1.20 src/usr.sbin/kgmon/kgmon.8
cvs rdiff -u -r1.26 -r1.27 src/usr.sbin/kgmon/kgmon.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/common/lib/libc/gmon/mcount.c
diff -u src/common/lib/libc/gmon/mcount.c:1.15 src/common/lib/libc/gmon/mcount.c:1.16
--- src/common/lib/libc/gmon/mcount.c:1.15	Sat Aug 14 17:38:44 2021
+++ src/common/lib/libc/gmon/mcount.c	Sat Aug 14 17:51:18 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: mcount.c,v 1.15 2021/08/14 17:38:44 ryo Exp $	*/
+/*	$NetBSD: mcount.c,v 1.16 2021/08/14 17:51:18 ryo Exp $	*/
 
 /*
  * Copyright (c) 2003, 2004 Wasabi Systems, Inc.
@@ -76,13 +76,14 @@
 #if 0
 static char sccsid[] = "@(#)mcount.c	8.1 (Berkeley) 6/4/93";
 #else
-__RCSID("$NetBSD: mcount.c,v 1.15 2021/08/14 17:38:44 ryo Exp $");
+__RCSID("$NetBSD: mcount.c,v 1.16 2021/08/14 17:51:18 ryo Exp $");
 #endif
 #endif
 
 #include <sys/param.h>
 #include <sys/gmon.h>
 #include <sys/lock.h>
+#include <sys/proc.h>
 
 #ifndef _KERNEL
 #include "reentrant.h"
@@ -94,10 +95,6 @@ extern struct gmonparam _gmondummy;
 struct gmonparam *_m_gmon_alloc(void);
 #endif
 
-#if defined(_KERNEL) && !defined(_RUMPKERNEL) && defined(MULTIPROCESSOR)
-__cpu_simple_lock_t __mcount_lock;
-#endif
-
 #ifndef __LINT__
 _MCOUNT_DECL(u_long, u_long)
 #ifdef _KERNEL
@@ -168,8 +165,11 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 #if defined(_KERNEL) && !defined(_RUMPKERNEL)
 	MCOUNT_ENTER;
 #ifdef MULTIPROCESSOR
-	__cpu_simple_lock(&__mcount_lock);
-	__insn_barrier();
+	p = curcpu()->ci_gmon;
+	if (p == NULL || p->state != GMON_PROF_ON) {
+		MCOUNT_EXIT;
+		return;
+	}
 #endif
 #endif
 	p->state = GMON_PROF_BUSY;
@@ -264,10 +264,6 @@ _MCOUNT_DECL(u_long frompc, u_long selfp
 done:
 	p->state = GMON_PROF_ON;
 #if defined(_KERNEL) && !defined(_RUMPKERNEL)
-#ifdef MULTIPROCESSOR
-	__insn_barrier();
-	__cpu_simple_unlock(&__mcount_lock);
-#endif
 	MCOUNT_EXIT;
 #endif
 	return;
@@ -275,10 +271,6 @@ done:
 overflow:
 	p->state = GMON_PROF_ERROR;
 #if defined(_KERNEL) && !defined(_RUMPKERNEL)
-#ifdef MULTIPROCESSOR
-	__insn_barrier();
-	__cpu_simple_unlock(&__mcount_lock);
-#endif
 	MCOUNT_EXIT;
 #endif
 	return;
@@ -293,4 +285,106 @@ overflow:
 MCOUNT
 #endif
 
+#if defined(_KERNEL) && !defined(_RUMPKERNEL) && defined(MULTIPROCESSOR)
+void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
+
+void
+_gmonparam_merge(struct gmonparam *p, struct gmonparam *q)
+{
+	u_long fromindex;
+	u_short *frompcindex, qtoindex, toindex;
+	u_long selfpc;
+	u_long endfrom;
+	long count;
+	struct tostruct *top;
+	int i;
+
+	count = q->kcountsize / sizeof(*q->kcount);
+	for (i = 0; i < count; i++)
+		p->kcount[i] += q->kcount[i];
+
+	endfrom = (q->fromssize / sizeof(*q->froms));
+	for (fromindex = 0; fromindex < endfrom; fromindex++) {
+		if (q->froms[fromindex] == 0)
+			continue;
+		for (qtoindex = q->froms[fromindex]; qtoindex != 0;
+		     qtoindex = q->tos[qtoindex].link) {
+			selfpc = q->tos[qtoindex].selfpc;
+			count = q->tos[qtoindex].count;
+			/* cribbed from mcount */
+			frompcindex = &p->froms[fromindex];
+			toindex = *frompcindex;
+			if (toindex == 0) {
+				/*
+				 * first time traversing this arc
+				 */
+				toindex = ++p->tos[0].link;
+				if (toindex >= p->tolimit)
+					/* halt further profiling */
+					goto overflow;
+
+				*frompcindex = (u_short)toindex;
+				top = &p->tos[(size_t)toindex];
+				top->selfpc = selfpc;
+				top->count = count;
+				top->link = 0;
+				goto done;
+			}
+			top = &p->tos[(size_t)toindex];
+			if (top->selfpc == selfpc) {
+				/*
+				 * arc at front of chain; usual case.
+				 */
+				top->count+= count;
+				goto done;
+			}
+			/*
+			 * have to go looking down chain for it.
+			 * top points to what we are looking at,
+			 * we know it is not at the head of the chain.
+			 */
+			for (; /* goto done */; ) {
+				if (top->link == 0) {
+					/*
+					 * top is end of the chain and
+					 * none of the chain had
+					 * top->selfpc == selfpc.  so
+					 * we allocate a new tostruct
+					 * and link it to the head of
+					 * the chain.
+					 */
+					toindex = ++p->tos[0].link;
+					if (toindex >= p->tolimit)
+						goto overflow;
+
+					top = &p->tos[(size_t)toindex];
+					top->selfpc = selfpc;
+					top->count = count;
+					top->link = *frompcindex;
+					*frompcindex = (u_short)toindex;
+					goto done;
+				}
+				/*
+				 * otherwise, check the next arc on the chain.
+				 */
+				top = &p->tos[top->link];
+				if (top->selfpc == selfpc) {
+					/*
+					 * there it is.
+					 * add to its count.
+					 */
+					top->count += count;
+					goto done;
+				}
+			}
+
+		done: ;
+		}
+
+	}
+ overflow: ;
+
+}
+#endif
+
 #endif /* (!_KERNEL || GPROF) && !_STANDALONE */

Index: src/sys/arch/aarch64/include/cpu.h
diff -u src/sys/arch/aarch64/include/cpu.h:1.37 src/sys/arch/aarch64/include/cpu.h:1.38
--- src/sys/arch/aarch64/include/cpu.h:1.37	Sun Aug  8 19:28:08 2021
+++ src/sys/arch/aarch64/include/cpu.h	Sat Aug 14 17:51:18 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.37 2021/08/08 19:28:08 skrll Exp $ */
+/* $NetBSD: cpu.h,v 1.38 2021/08/14 17:51:18 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014, 2020 The NetBSD Foundation, Inc.
@@ -37,6 +37,7 @@
 #ifdef __aarch64__
 
 #ifdef _KERNEL_OPT
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #endif
 
@@ -133,6 +134,9 @@ struct cpu_info {
 	struct aarch64_cache_info *ci_cacheinfo;
 	struct aarch64_cpufuncs ci_cpufuncs;
 
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 } __aligned(COHERENCY_UNIT);
 
 #ifdef _KERNEL

Index: src/sys/arch/alpha/include/cpu.h
diff -u src/sys/arch/alpha/include/cpu.h:1.103 src/sys/arch/alpha/include/cpu.h:1.104
--- src/sys/arch/alpha/include/cpu.h:1.103	Thu Jul 22 01:39:18 2021
+++ src/sys/arch/alpha/include/cpu.h	Sat Aug 14 17:51:18 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.103 2021/07/22 01:39:18 thorpej Exp $ */
+/* $NetBSD: cpu.h,v 1.104 2021/08/14 17:51:18 ryo Exp $ */
 
 /*-
  * Copyright (c) 1998, 1999, 2000, 2001 The NetBSD Foundation, Inc.
@@ -72,6 +72,7 @@
 #define _ALPHA_CPU_H_
 
 #if defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
 #endif
@@ -140,6 +141,9 @@ struct cpu_info {
 	uint64_t ci_pcc_freq;		/* cpu cycles/second */
 	struct trapframe *ci_db_regs;	/* registers for debuggers */
 	u_int	ci_nintrhand;		/* # of interrupt handlers */
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* [MI] per-cpu GPROF */
+#endif
 };
 
 /* Ensure some cpu_info fields are within the signed 16-bit displacement. */

Index: src/sys/arch/arm/include/cpu.h
diff -u src/sys/arch/arm/include/cpu.h:1.118 src/sys/arch/arm/include/cpu.h:1.119
--- src/sys/arch/arm/include/cpu.h:1.118	Sun Aug  8 19:28:08 2021
+++ src/sys/arch/arm/include/cpu.h	Sat Aug 14 17:51:18 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.118 2021/08/08 19:28:08 skrll Exp $	*/
+/*	$NetBSD: cpu.h,v 1.119 2021/08/14 17:51:18 ryo Exp $	*/
 
 /*
  * Copyright (c) 1994-1996 Mark Brinicombe.
@@ -92,6 +92,7 @@ void	cpu_proc_fork(struct proc *, struct
  */
 
 #if !defined(_MODULE) && defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #include "opt_cpuoptions.h"
 #include "opt_lockdebug.h"
@@ -223,6 +224,10 @@ struct cpu_info {
 
 	struct arm_cache_info *
 			ci_cacheinfo;
+
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 extern struct cpu_info cpu_info_store[];

Index: src/sys/arch/hppa/include/cpu.h
diff -u src/sys/arch/hppa/include/cpu.h:1.10 src/sys/arch/hppa/include/cpu.h:1.11
--- src/sys/arch/hppa/include/cpu.h:1.10	Thu Apr 16 09:28:52 2020
+++ src/sys/arch/hppa/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.10 2020/04/16 09:28:52 skrll Exp $	*/
+/*	$NetBSD: cpu.h,v 1.11 2021/08/14 17:51:19 ryo Exp $	*/
 
 /*	$OpenBSD: cpu.h,v 1.55 2008/07/23 17:39:35 kettenis Exp $	*/
 
@@ -55,6 +55,7 @@
 
 #ifdef _KERNEL_OPT
 #include "opt_cputype.h"
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #endif
 
@@ -300,7 +301,9 @@ struct cpu_info {
 
 	struct cpu_softc *ci_softc;
 #endif
-
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 #endif /* !_KMEMUSER */
 } __aligned(64);
 

Index: src/sys/arch/mips/include/cpu.h
diff -u src/sys/arch/mips/include/cpu.h:1.132 src/sys/arch/mips/include/cpu.h:1.133
--- src/sys/arch/mips/include/cpu.h:1.132	Mon Mar 29 01:47:45 2021
+++ src/sys/arch/mips/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.132 2021/03/29 01:47:45 simonb Exp $	*/
+/*	$NetBSD: cpu.h,v 1.133 2021/08/14 17:51:19 ryo Exp $	*/
 
 /*-
  * Copyright (c) 1992, 1993
@@ -49,6 +49,7 @@
 
 #if defined(_KERNEL_OPT)
 #include "opt_cputype.h"
+#include "opt_gprof.h"
 #include "opt_lockdebug.h"
 #include "opt_multiprocessor.h"
 #endif
@@ -159,6 +160,9 @@ struct cpu_info {
 	kcpuset_t *ci_watchcpus;
 	kcpuset_t *ci_ddbcpus;
 #endif
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 
 };
 #endif /* _KERNEL || _KMEMUSER */

Index: src/sys/arch/or1k/include/cpu.h
diff -u src/sys/arch/or1k/include/cpu.h:1.4 src/sys/arch/or1k/include/cpu.h:1.5
--- src/sys/arch/or1k/include/cpu.h:1.4	Sun Dec  1 15:34:45 2019
+++ src/sys/arch/or1k/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.4 2019/12/01 15:34:45 ad Exp $ */
+/* $NetBSD: cpu.h,v 1.5 2021/08/14 17:51:19 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -64,6 +64,9 @@ struct cpu_info {
 	int ci_cpl;
 	u_int ci_softints;
 	volatile u_int ci_intr_depth;
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 register struct lwp *or1k_curlwp __asm("r10");

Index: src/sys/arch/powerpc/include/cpu.h
diff -u src/sys/arch/powerpc/include/cpu.h:1.118 src/sys/arch/powerpc/include/cpu.h:1.119
--- src/sys/arch/powerpc/include/cpu.h:1.118	Sun Mar  7 14:42:53 2021
+++ src/sys/arch/powerpc/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.118 2021/03/07 14:42:53 rin Exp $	*/
+/*	$NetBSD: cpu.h,v 1.119 2021/08/14 17:51:19 ryo Exp $	*/
 
 /*
  * Copyright (C) 1999 Wolfgang Solfrank.
@@ -45,6 +45,7 @@ struct cache_info {
 
 #if defined(_KERNEL) || defined(_KMEMUSER)
 #if defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_modular.h"
 #include "opt_multiprocessor.h"
 #include "opt_ppcarch.h"
@@ -159,6 +160,9 @@ struct cpu_info {
 	struct evcnt ci_ev_tlbmiss_soft; /* tlb miss (no trap) */
 	struct evcnt ci_ev_dtlbmiss_hard; /* data tlb miss (trap) */
 	struct evcnt ci_ev_itlbmiss_hard; /* instruction tlb miss (trap) */
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 #endif /* _KERNEL */
 };
 #endif /* _KERNEL || _KMEMUSER */

Index: src/sys/arch/riscv/include/cpu.h
diff -u src/sys/arch/riscv/include/cpu.h:1.7 src/sys/arch/riscv/include/cpu.h:1.8
--- src/sys/arch/riscv/include/cpu.h:1.7	Sun Dec  1 15:34:45 2019
+++ src/sys/arch/riscv/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.7 2019/12/01 15:34:45 ad Exp $ */
+/* $NetBSD: cpu.h,v 1.8 2021/08/14 17:51:19 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -78,6 +78,9 @@ struct cpu_info {
 	struct evcnt ci_ev_fpu_saves;
 	struct evcnt ci_ev_fpu_loads;
 	struct evcnt ci_ev_fpu_reenables;
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 #endif /* _KERNEL || _KMEMUSER */

Index: src/sys/arch/sparc/include/cpu.h
diff -u src/sys/arch/sparc/include/cpu.h:1.109 src/sys/arch/sparc/include/cpu.h:1.110
--- src/sys/arch/sparc/include/cpu.h:1.109	Sun Jan 24 07:36:54 2021
+++ src/sys/arch/sparc/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.109 2021/01/24 07:36:54 mrg Exp $ */
+/*	$NetBSD: cpu.h,v 1.110 2021/08/14 17:51:19 ryo Exp $ */
 
 /*
  * Copyright (c) 1992, 1993
@@ -120,6 +120,7 @@ struct cacheinfo {
 #if defined(_KERNEL) || defined(_KMEMUSER)
 
 #if defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
 #include "opt_sparc_arch.h"
@@ -395,6 +396,10 @@ struct cpu_info {
 	struct evcnt ci_sintrcnt[16];
 
 	struct cpu_data ci_data;	/* MI per-cpu data */
+
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 #endif /* _KERNEL || _KMEMUSER */

Index: src/sys/arch/sparc64/include/cpu.h
diff -u src/sys/arch/sparc64/include/cpu.h:1.132 src/sys/arch/sparc64/include/cpu.h:1.133
--- src/sys/arch/sparc64/include/cpu.h:1.132	Mon Apr  5 22:36:27 2021
+++ src/sys/arch/sparc64/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.132 2021/04/05 22:36:27 nakayama Exp $ */
+/*	$NetBSD: cpu.h,v 1.133 2021/08/14 17:51:19 ryo Exp $ */
 
 /*
  * Copyright (c) 1992, 1993
@@ -70,6 +70,7 @@ struct cacheinfo {
  */
 
 #if defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
 #endif
@@ -220,6 +221,10 @@ struct cpu_info {
 	volatile void		*ci_ddb_regs;	/* DDB regs */
 
 	void (*ci_idlespin)(void);
+
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 #endif /* _KERNEL || _KMEMUSER */

Index: src/sys/arch/vax/include/cpu.h
diff -u src/sys/arch/vax/include/cpu.h:1.104 src/sys/arch/vax/include/cpu.h:1.105
--- src/sys/arch/vax/include/cpu.h:1.104	Sun Dec  1 15:34:46 2019
+++ src/sys/arch/vax/include/cpu.h	Sat Aug 14 17:51:19 2021
@@ -1,4 +1,4 @@
-/*      $NetBSD: cpu.h,v 1.104 2019/12/01 15:34:46 ad Exp $      */
+/*      $NetBSD: cpu.h,v 1.105 2021/08/14 17:51:19 ryo Exp $      */
 
 /*
  * Copyright (c) 1994 Ludd, University of Lule}, Sweden
@@ -29,6 +29,7 @@
 #define _VAX_CPU_H_
 
 #if defined(_KERNEL_OPT)
+#include "opt_gprof.h"
 #include "opt_multiprocessor.h"
 #include "opt_lockdebug.h"
 #endif
@@ -142,6 +143,9 @@ struct cpu_info {
 	SIMPLEQ_ENTRY(cpu_info) ci_next; /* next cpu_info */
 #endif
 	uintptr_t ci_cas_addr;		/* current address doing CAS in a RAS */
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 #define	CI_MASTERCPU	1		/* Set if master CPU */
 #define	CI_RUNNING	2		/* Set when a slave CPU is running */

Index: src/sys/arch/x86/include/cpu.h
diff -u src/sys/arch/x86/include/cpu.h:1.130 src/sys/arch/x86/include/cpu.h:1.131
--- src/sys/arch/x86/include/cpu.h:1.130	Fri Feb 19 02:15:24 2021
+++ src/sys/arch/x86/include/cpu.h	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: cpu.h,v 1.130 2021/02/19 02:15:24 christos Exp $	*/
+/*	$NetBSD: cpu.h,v 1.131 2021/08/14 17:51:20 ryo Exp $	*/
 
 /*
  * Copyright (c) 1990 The Regents of the University of California.
@@ -321,6 +321,10 @@ struct cpu_info {
 	struct evcnt	ci_xen_systime_backwards_hardclock_evcnt;
 	struct evcnt	ci_xen_missed_hardclock_evcnt;
 #endif	/* XEN */
+
+#if defined(GPROF) && defined(MULTIPROCESSOR)
+	struct gmonparam *ci_gmon;	/* MI per-cpu GPROF */
+#endif
 };
 
 #if defined(XEN) && !defined(XENPV)

Index: src/sys/kern/kern_clock.c
diff -u src/sys/kern/kern_clock.c:1.144 src/sys/kern/kern_clock.c:1.145
--- src/sys/kern/kern_clock.c:1.144	Sat Jan 16 02:20:00 2021
+++ src/sys/kern/kern_clock.c	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: kern_clock.c,v 1.144 2021/01/16 02:20:00 riastradh Exp $	*/
+/*	$NetBSD: kern_clock.c,v 1.145 2021/08/14 17:51:20 ryo Exp $	*/
 
 /*-
  * Copyright (c) 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
@@ -69,11 +69,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.144 2021/01/16 02:20:00 riastradh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: kern_clock.c,v 1.145 2021/08/14 17:51:20 ryo Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_dtrace.h"
 #include "opt_gprof.h"
+#include "opt_multiprocessor.h"
 #endif
 
 #include <sys/param.h>
@@ -456,8 +457,14 @@ statclock(struct clockframe *frame)
 		/*
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 */
+#ifdef MULTIPROCESSOR
+		g = curcpu()->ci_gmon;
+		if (g != NULL &&
+		    profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
+#else
 		g = &_gmonparam;
 		if (profsrc == PROFSRC_CLOCK && g->state == GMON_PROF_ON) {
+#endif
 			i = CLKF_PC(frame) - g->lowpc;
 			if (i < g->textsize) {
 				i /= HISTFRACTION * sizeof(*g->kcount);

Index: src/sys/kern/subr_prof.c
diff -u src/sys/kern/subr_prof.c:1.49 src/sys/kern/subr_prof.c:1.50
--- src/sys/kern/subr_prof.c:1.49	Sat Apr  6 03:06:28 2019
+++ src/sys/kern/subr_prof.c	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: subr_prof.c,v 1.49 2019/04/06 03:06:28 thorpej Exp $	*/
+/*	$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1993
@@ -32,10 +32,11 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.49 2019/04/06 03:06:28 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: subr_prof.c,v 1.50 2021/08/14 17:51:20 ryo Exp $");
 
 #ifdef _KERNEL_OPT
 #include "opt_gprof.h"
+#include "opt_multiprocessor.h"
 #endif
 
 #include <sys/param.h>
@@ -51,9 +52,15 @@ __KERNEL_RCSID(0, "$NetBSD: subr_prof.c,
 #ifdef GPROF
 #include <sys/malloc.h>
 #include <sys/gmon.h>
+#include <sys/xcall.h>
 
 MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer");
 
+static int sysctl_kern_profiling(SYSCTLFN_ARGS);
+#ifdef MULTIPROCESSOR
+void _gmonparam_merge(struct gmonparam *, struct gmonparam *);
+#endif
+
 /*
  * Froms is actually a bunch of unsigned shorts indexing tos
  */
@@ -70,6 +77,7 @@ kmstartup(void)
 {
 	char *cp;
 	struct gmonparam *p = &_gmonparam;
+	unsigned long size;
 	/*
 	 * Round lowpc and highpc to multiples of the density we're using
 	 * so the rest of the scaling (here and in gprof) stays in ints.
@@ -90,8 +98,101 @@ kmstartup(void)
 	else if (p->tolimit > MAXARCS)
 		p->tolimit = MAXARCS;
 	p->tossize = p->tolimit * sizeof(struct tostruct);
-	cp = malloc(p->kcountsize + p->fromssize + p->tossize,
-	    M_GPROF, M_NOWAIT | M_ZERO);
+
+	size = p->kcountsize + p->fromssize + p->tossize;
+#ifdef MULTIPROCESSOR
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci;
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		p = malloc(sizeof(struct gmonparam) + size, M_GPROF,
+		    M_NOWAIT | M_ZERO);
+		if (p == NULL) {
+			printf("No memory for profiling on %s\n",
+			    cpu_name(ci));
+			/* cannot profile on this cpu */
+			continue;
+		}
+		memcpy(p, &_gmonparam, sizeof(_gmonparam));
+		ci->ci_gmon = p;
+
+		/*
+		 * To allow profiling to be controlled only by the global
+		 * _gmonparam.state, set the default value for each CPU to
+		 * GMON_PROF_ON. If _gmonparam.state is not ON, mcount will
+		 * not be executed.
+		 * This is For compatibility of the kgmon(8) kmem interface.
+		 */
+		p->state = GMON_PROF_ON;
+
+		cp = (char *)(p + 1);
+		p->tos = (struct tostruct *)cp;
+		p->kcount = (u_short *)(cp + p->tossize);
+		p->froms = (u_short *)(cp + p->tossize + p->kcountsize);
+	}
+
+	sysctl_createv(NULL, 0, NULL, NULL,
+	    0, CTLTYPE_NODE, "percpu",
+	    SYSCTL_DESCR("per cpu profiling information"),
+	    NULL, 0, NULL, 0,
+	    CTL_KERN, KERN_PROF, GPROF_PERCPU, CTL_EOL);
+
+	for (CPU_INFO_FOREACH(cii, ci)) {
+		if (ci->ci_gmon == NULL)
+			continue;
+
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    0, CTLTYPE_NODE, cpu_name(ci),
+		    NULL,
+		    NULL, 0, NULL, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci), CTL_EOL);
+
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    CTLFLAG_READWRITE, CTLTYPE_INT, "state",
+		    SYSCTL_DESCR("Profiling state"),
+		    sysctl_kern_profiling, 0, (void *)ci, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
+		    GPROF_STATE, CTL_EOL);
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "count",
+		    SYSCTL_DESCR("Array of statistical program counters"),
+		    sysctl_kern_profiling, 0, (void *)ci, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
+		    GPROF_COUNT, CTL_EOL);
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "froms",
+		    SYSCTL_DESCR("Array indexed by program counter of "
+		    "call-from points"),
+		    sysctl_kern_profiling, 0, (void *)ci, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
+		    GPROF_FROMS, CTL_EOL);
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "tos",
+		    SYSCTL_DESCR("Array of structures describing "
+		    "destination of calls and their counts"),
+		    sysctl_kern_profiling, 0, (void *)ci, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
+		    GPROF_TOS, CTL_EOL);
+		sysctl_createv(NULL, 0, NULL, NULL,
+		    CTLFLAG_READWRITE, CTLTYPE_STRUCT, "gmonparam",
+		    SYSCTL_DESCR("Structure giving the sizes of the above "
+		    "arrays"),
+		    sysctl_kern_profiling, 0, (void *)ci, 0,
+		    CTL_KERN, KERN_PROF, GPROF_PERCPU, cpu_index(ci),
+		    GPROF_GMONPARAM, CTL_EOL);
+	}
+
+	/*
+	 * For minimal compatibility of the kgmon(8) kmem interface,
+	 * the _gmonparam and cpu0:ci_gmon share buffers.
+	 */
+	p = curcpu()->ci_gmon;
+	if (p != NULL) {
+		_gmonparam.tos = p->tos;
+		_gmonparam.kcount = p->kcount;
+		_gmonparam.froms = p->froms;
+	}
+#else /* MULTIPROCESSOR */
+	cp = malloc(size, M_GPROF, M_NOWAIT | M_ZERO);
 	if (cp == 0) {
 		printf("No memory for profiling.\n");
 		return;
@@ -101,7 +202,20 @@ kmstartup(void)
 	p->kcount = (u_short *)cp;
 	cp += p->kcountsize;
 	p->froms = (u_short *)cp;
+#endif /* MULTIPROCESSOR */
+}
+
+#ifdef MULTIPROCESSOR
+static void
+prof_set_state_xc(void *arg1, void *arg2 __unused)
+{
+	int state = PTRTOUINT64(arg1);
+	struct gmonparam *gp = curcpu()->ci_gmon;
+
+	if (gp != NULL)
+		gp->state = state;
 }
+#endif /* MULTIPROCESSOR */
 
 /*
  * Return kernel profiling information.
@@ -113,15 +227,72 @@ kmstartup(void)
 static int
 sysctl_kern_profiling(SYSCTLFN_ARGS)
 {
-	struct gmonparam *gp = &_gmonparam;
+	struct sysctlnode node = *rnode;
+	struct gmonparam *gp;
 	int error;
-	struct sysctlnode node;
-
-	node = *rnode;
+#ifdef MULTIPROCESSOR
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci, *target_ci;
+	uint64_t where;
+	int state;
+	bool prof_on, do_merge;
+
+	target_ci = (struct cpu_info *)rnode->sysctl_data;
+	do_merge = (oldp != NULL) && (target_ci == NULL) &&
+	    ((node.sysctl_num == GPROF_COUNT) ||
+	    (node.sysctl_num == GPROF_FROMS) ||
+	    (node.sysctl_num == GPROF_TOS));
+
+	if (do_merge) {
+		/* kern.profiling.{count,froms,tos} */
+		unsigned long size;
+		char *cp;
+
+		/* allocate temporary gmonparam, and merge results of all CPU */
+		size = _gmonparam.kcountsize + _gmonparam.fromssize +
+		    _gmonparam.tossize;
+		gp = malloc(sizeof(struct gmonparam) + size, M_GPROF,
+		    M_NOWAIT | M_ZERO);
+		if (gp == NULL)
+			return ENOMEM;
+		memcpy(gp, &_gmonparam, sizeof(_gmonparam));
+		cp = (char *)(gp + 1);
+		gp->tos = (struct tostruct *)cp;
+		gp->kcount = (u_short *)(cp + gp->tossize);
+		gp->froms = (u_short *)(cp + gp->tossize + gp->kcountsize);
+
+		for (CPU_INFO_FOREACH(cii, ci)) {
+			if (ci->ci_gmon == NULL)
+				continue;
+			_gmonparam_merge(gp, ci->ci_gmon);
+		}
+	} else if (target_ci != NULL) {
+		/* kern.profiling.percpu.* */
+		gp = target_ci->ci_gmon;
+	} else {
+		/* kern.profiling.{state,gmonparam} */
+		gp = &_gmonparam;
+	}
+#else /* MULTIPROCESSOR */
+	gp = &_gmonparam;
+#endif
 
 	switch (node.sysctl_num) {
 	case GPROF_STATE:
+#ifdef MULTIPROCESSOR
+		/*
+		 * if _gmonparam.state is OFF, the state of each CPU is
+		 * considered to be OFF, even if it is actually ON.
+		 */
+		if (_gmonparam.state == GMON_PROF_OFF ||
+		    gp->state == GMON_PROF_OFF)
+			state = GMON_PROF_OFF;
+		else
+			state = GMON_PROF_ON;
+		node.sysctl_data = &state;
+#else
 		node.sysctl_data = &gp->state;
+#endif
 		break;
 	case GPROF_COUNT:
 		node.sysctl_data = gp->kcount;
@@ -145,8 +316,97 @@ sysctl_kern_profiling(SYSCTLFN_ARGS)
 
 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
 	if (error || newp == NULL)
-		return (error);
+		goto done;
 
+#ifdef MULTIPROCESSOR
+	switch (node.sysctl_num) {
+	case GPROF_STATE:
+		if (target_ci != NULL) {
+			where = xc_unicast(0, prof_set_state_xc,
+			    UINT64TOPTR(state), NULL, target_ci);
+			xc_wait(where);
+
+			/* if even one CPU being profiled, enable perfclock. */
+			prof_on = false;
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if (ci->ci_gmon == NULL)
+					continue;
+				if (ci->ci_gmon->state != GMON_PROF_OFF) {
+					prof_on = true;
+					break;
+				}
+			}
+			mutex_spin_enter(&proc0.p_stmutex);
+			if (prof_on)
+				startprofclock(&proc0);
+			else
+				stopprofclock(&proc0);
+			mutex_spin_exit(&proc0.p_stmutex);
+
+			if (prof_on) {
+				_gmonparam.state = GMON_PROF_ON;
+			} else {
+				_gmonparam.state = GMON_PROF_OFF;
+				/*
+				 * when _gmonparam.state and all CPU gmon state
+				 * are OFF, all CPU states should be ON so that
+				 * the entire CPUs profiling can be controlled
+				 * by _gmonparam.state only.
+				 */
+				for (CPU_INFO_FOREACH(cii, ci)) {
+					if (ci->ci_gmon == NULL)
+						continue;
+					ci->ci_gmon->state = GMON_PROF_ON;
+				}
+			}
+		} else {
+			_gmonparam.state = state;
+			where = xc_broadcast(0, prof_set_state_xc,
+			    UINT64TOPTR(state), NULL);
+			xc_wait(where);
+
+			mutex_spin_enter(&proc0.p_stmutex);
+			if (state == GMON_PROF_OFF)
+				stopprofclock(&proc0);
+			else
+				startprofclock(&proc0);
+			mutex_spin_exit(&proc0.p_stmutex);
+		}
+		break;
+	case GPROF_COUNT:
+		/*
+		 * if 'kern.profiling.{count,froms,tos}' is written, the same
+		 * data will be written to 'kern.profiling.percpu.cpuN.xxx'
+		 */
+		if (target_ci == NULL) {
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if (ci->ci_gmon == NULL)
+					continue;
+				memmove(ci->ci_gmon->kcount, gp->kcount,
+				    newlen);
+			}
+		}
+		break;
+	case GPROF_FROMS:
+		if (target_ci == NULL) {
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if (ci->ci_gmon == NULL)
+					continue;
+				memmove(ci->ci_gmon->froms, gp->froms, newlen);
+			}
+		}
+		break;
+	case GPROF_TOS:
+		if (target_ci == NULL) {
+			for (CPU_INFO_FOREACH(cii, ci)) {
+				if (ci->ci_gmon == NULL)
+					continue;
+				memmove(ci->ci_gmon->tos, gp->tos, newlen);
+			}
+		}
+		break;
+	}
+#else
 	if (node.sysctl_num == GPROF_STATE) {
 		mutex_spin_enter(&proc0.p_stmutex);
 		if (gp->state == GMON_PROF_OFF)
@@ -155,8 +415,14 @@ sysctl_kern_profiling(SYSCTLFN_ARGS)
 			startprofclock(&proc0);
 		mutex_spin_exit(&proc0.p_stmutex);
 	}
+#endif
 
-	return (0);
+ done:
+#ifdef MULTIPROCESSOR
+	if (do_merge)
+		free(gp, M_GPROF);
+#endif
+	return error;
 }
 
 SYSCTL_SETUP(sysctl_kern_gprof_setup, "sysctl kern.profiling subtree setup")

Index: src/sys/sys/gmon.h
diff -u src/sys/sys/gmon.h:1.10 src/sys/sys/gmon.h:1.11
--- src/sys/sys/gmon.h:1.10	Fri Mar  9 15:38:03 2012
+++ src/sys/sys/gmon.h	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: gmon.h,v 1.10 2012/03/09 15:38:03 christos Exp $	*/
+/*	$NetBSD: gmon.h,v 1.11 2021/08/14 17:51:20 ryo Exp $	*/
 
 /*-
  * Copyright (c) 1982, 1986, 1992, 1993
@@ -150,4 +150,5 @@ extern struct gmonparam _gmonparam;
 #define	GPROF_FROMS	2	/* struct: from location hash bucket */
 #define	GPROF_TOS	3	/* struct: destination/count structure */
 #define	GPROF_GMONPARAM	4	/* struct: profiling parameters (see above) */
+#define	GPROF_PERCPU	5	/* per cpu node */
 #endif /* !_SYS_GMON_H_ */

Index: src/usr.sbin/kgmon/kgmon.8
diff -u src/usr.sbin/kgmon/kgmon.8:1.19 src/usr.sbin/kgmon/kgmon.8:1.20
--- src/usr.sbin/kgmon/kgmon.8:1.19	Mon Apr 25 22:46:35 2011
+++ src/usr.sbin/kgmon/kgmon.8	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-.\"	$NetBSD: kgmon.8,v 1.19 2011/04/25 22:46:35 wiz Exp $
+.\"	$NetBSD: kgmon.8,v 1.20 2021/08/14 17:51:20 ryo Exp $
 .\"
 .\" Copyright (c) 1983, 1991, 1993
 .\"	The Regents of the University of California.  All rights reserved.
@@ -29,7 +29,7 @@
 .\"
 .\"     from: @(#)kgmon.8	8.1 (Berkeley) 6/6/93
 .\"
-.Dd June 6, 1993
+.Dd August 10, 2021
 .Dt KGMON 8
 .Os
 .Sh NAME
@@ -38,6 +38,7 @@
 .Sh SYNOPSIS
 .Nm
 .Op Fl bdhpr
+.Op Fl c Ar cpuid
 .Op Fl M Ar core
 .Op Fl N Ar system
 .Sh DESCRIPTION
@@ -55,6 +56,8 @@ flag is specified,
 .Nm
 extracts profile data from the operating system and produces a
 .Pa gmon.out
+or
+.Pa gmon-<id>.out
 file suitable for later analysis by
 .Xr gprof 1 .
 .Pp
@@ -62,6 +65,18 @@ The options are as follows:
 .Bl -tag -width Ds
 .It Fl b
 Resume the collection of profile data.
+.It Fl c Ar cpuid
+Operate on the CPU specified by
+.Pa cpuid .
+If this option is specified with
+.Fl p ,
+the output file name will be
+.Pa gmon-<id>.out
+instead of
+.Pa gmon.out .
+And if the cpuid argument is
+.Dq all
+, the operation is performed for each cpu.
 .It Fl d
 Enable debug output.
 .It Fl h
@@ -79,6 +94,8 @@ instead of the default
 .It Fl p
 Dump the contents of the profile buffers into a
 .Pa gmon.out
+or
+.Pa gmon-<id>.out
 file.
 .It Fl r
 Reset all the profile buffers.
@@ -86,6 +103,8 @@ If the
 .Fl p
 flag is also specified, the
 .Pa gmon.out
+or
+.Pa gmon-<id>.out
 file is generated before the buffers are reset.
 .El
 .Pp

Index: src/usr.sbin/kgmon/kgmon.c
diff -u src/usr.sbin/kgmon/kgmon.c:1.26 src/usr.sbin/kgmon/kgmon.c:1.27
--- src/usr.sbin/kgmon/kgmon.c:1.26	Thu Jul 11 03:49:52 2019
+++ src/usr.sbin/kgmon/kgmon.c	Sat Aug 14 17:51:20 2021
@@ -1,4 +1,4 @@
-/*	$NetBSD: kgmon.c,v 1.26 2019/07/11 03:49:52 msaitoh Exp $	*/
+/*	$NetBSD: kgmon.c,v 1.27 2021/08/14 17:51:20 ryo Exp $	*/
 
 /*
  * Copyright (c) 1983, 1992, 1993
@@ -39,7 +39,7 @@ __COPYRIGHT("@(#) Copyright (c) 1983, 19
 #if 0
 static char sccsid[] = "from: @(#)kgmon.c	8.1 (Berkeley) 6/6/93";
 #else
-__RCSID("$NetBSD: kgmon.c,v 1.26 2019/07/11 03:49:52 msaitoh Exp $");
+__RCSID("$NetBSD: kgmon.c,v 1.27 2021/08/14 17:51:20 ryo Exp $");
 #endif
 #endif /* not lint */
 
@@ -72,28 +72,35 @@ struct kvmvars {
 	struct gmonparam gpm;
 };
 
-static int	bflag, hflag, kflag, rflag, pflag;
+static int	ncpu_mib[2] = { CTL_HW, HW_NCPU };
+static int	bflag, cflag_all, hflag, kflag, rflag, pflag;
 static int	debug = 0;
-static void	setprof(struct kvmvars *kvp, int state);
-static void	dumpstate(struct kvmvars *kvp);
-static void	reset(struct kvmvars *kvp);
+static void	setprof(struct kvmvars *kvp, int state, int cpuid);
+static void	dumpstate(struct kvmvars *kvp, int cpuid);
+static void	reset(struct kvmvars *kvp, int cpuid);
 static int	openfiles(char *, char *, struct kvmvars *);
-static int	getprof(struct kvmvars *);
+static int	getprof(struct kvmvars *, int);
 static void	kern_readonly(int);
 static int	getprofhz(struct kvmvars *);
 
 int
 main(int argc, char **argv)
 {
-	int ch, mode, disp, accessmode;
+	int ch, mode, disp, accessmode, ncpu, cpuid = -1;
 	struct kvmvars kvmvars;
+	size_t size;
 	char *sys, *kmemf;
+	char on_cpu[sizeof(" on cpuXXXXXXXXX")];
+
+	size = sizeof(ncpu);
+	if (sysctl(ncpu_mib, 2, &ncpu, &size, NULL, 0) < 0)
+		ncpu = 1;
 
 	setprogname(argv[0]);
 	(void)seteuid(getuid());
 	kmemf = NULL;
 	sys = NULL;
-	while ((ch = getopt(argc, argv, "M:N:bdhpr")) != -1) {
+	while ((ch = getopt(argc, argv, "M:N:bc:dhpr")) != -1) {
 		switch((char)ch) {
 
 		case 'M':
@@ -109,6 +116,17 @@ main(int argc, char **argv)
 			bflag = 1;
 			break;
 
+		case 'c':
+			if (strcmp(optarg, "all") == 0) {
+				cflag_all = 1;
+				cpuid = 0;
+			} else {
+				cpuid = strtol(optarg, NULL, 10);
+				if (cpuid >= ncpu)
+					errx(1, "illegal CPU id %s", optarg);
+			}
+			break;
+
 		case 'h':
 			hflag = 1;
 			break;
@@ -127,7 +145,7 @@ main(int argc, char **argv)
 
 		default:
 			(void)fprintf(stderr,
-			    "usage: %s [-bdhrp] [-M core] [-N system]\n",
+			    "usage: %s [-bdhrp] [-c cpuid] [-M core] [-N system]\n",
 			    getprogname());
 			exit(1);
 		}
@@ -146,21 +164,31 @@ main(int argc, char **argv)
 	}
 #endif
 	accessmode = openfiles(sys, kmemf, &kvmvars);
-	mode = getprof(&kvmvars);
-	if (hflag)
-		disp = GMON_PROF_OFF;
-	else if (bflag)
-		disp = GMON_PROF_ON;
-	else
-		disp = mode;
-	if (pflag)
-		dumpstate(&kvmvars);
-	if (rflag)
-		reset(&kvmvars);
-	if (accessmode == O_RDWR)
-		setprof(&kvmvars, disp);
-	(void)fprintf(stdout, "%s: kernel profiling is %s.\n",
-	     getprogname(), disp == GMON_PROF_OFF ? "off" : "running");
+
+	do {
+		if (cpuid == -1)
+			on_cpu[0] = '\0';
+		else
+			snprintf(on_cpu, sizeof(on_cpu), " on cpu%d", cpuid);
+
+		mode = getprof(&kvmvars, cpuid);
+		if (hflag)
+			disp = GMON_PROF_OFF;
+		else if (bflag)
+			disp = GMON_PROF_ON;
+		else
+			disp = mode;
+		if (pflag)
+			dumpstate(&kvmvars, cpuid);
+		if (rflag)
+			reset(&kvmvars, cpuid);
+		if (accessmode == O_RDWR)
+			setprof(&kvmvars, disp, cpuid);
+		(void)fprintf(stdout, "%s: kernel profiling is %s%s.\n",
+		    getprogname(), disp == GMON_PROF_OFF ? "off" : "running",
+		    on_cpu);
+
+	} while (cflag_all && ++cpuid < ncpu);
 	return (0);
 }
 
@@ -233,9 +261,9 @@ kern_readonly(int mode)
  * Get the state of kernel profiling.
  */
 static int
-getprof(struct kvmvars *kvp)
+getprof(struct kvmvars *kvp, int cpuid)
 {
-	int mib[3];
+	int mib[5], miblen, mibparam;
 	size_t size;
 
 	if (kflag) {
@@ -244,9 +272,18 @@ getprof(struct kvmvars *kvp)
 	} else {
 		mib[0] = CTL_KERN;
 		mib[1] = KERN_PROF;
-		mib[2] = GPROF_GMONPARAM;
+		if (cpuid < 0) {
+			mibparam = 2;
+			miblen = 3;
+		} else {
+			mib[2] = GPROF_PERCPU;
+			mib[3] = cpuid;
+			mibparam = 4;
+			miblen = 5;
+		}
+		mib[mibparam] = GPROF_GMONPARAM;
 		size = sizeof kvp->gpm;
-		if (sysctl(mib, 3, &kvp->gpm, &size, NULL, 0) < 0)
+		if (sysctl(mib, miblen, &kvp->gpm, &size, NULL, 0) < 0)
 			size = 0;
 	}
 	if (size != sizeof kvp->gpm)
@@ -259,23 +296,32 @@ getprof(struct kvmvars *kvp)
  * Enable or disable kernel profiling according to the state variable.
  */
 static void
-setprof(struct kvmvars *kvp, int state)
+setprof(struct kvmvars *kvp, int state, int cpuid)
 {
 	struct gmonparam *p = (struct gmonparam *)nl[N_GMONPARAM].n_value;
-	int mib[3], oldstate;
+	int mib[5], miblen, mibparam, oldstate;
 	size_t sz;
 
 	sz = sizeof(state);
 	if (!kflag) {
 		mib[0] = CTL_KERN;
 		mib[1] = KERN_PROF;
-		mib[2] = GPROF_STATE;
-		if (sysctl(mib, 3, &oldstate, &sz, NULL, 0) < 0)
+		if (cpuid < 0) {
+			mibparam = 2;
+			miblen = 3;
+		} else {
+			mib[2] = GPROF_PERCPU;
+			mib[3] = cpuid;
+			mibparam = 4;
+			miblen = 5;
+		}
+		mib[mibparam] = GPROF_STATE;
+		if (sysctl(mib, miblen, &oldstate, &sz, NULL, 0) < 0)
 			goto bad;
 		if (oldstate == state)
 			return;
 		(void)seteuid(0);
-		if (sysctl(mib, 3, NULL, NULL, &state, sz) >= 0) {
+		if (sysctl(mib, miblen, NULL, NULL, &state, sz) >= 0) {
 			(void)seteuid(getuid());
 			return;
 		}
@@ -292,23 +338,41 @@ bad:
  * Build the gmon.out file.
  */
 static void
-dumpstate(struct kvmvars *kvp)
+dumpstate(struct kvmvars *kvp, int cpuid)
 {
 	FILE *fp;
 	struct rawarc rawarc;
 	struct tostruct *tos;
 	u_long frompc;
 	u_short *froms, *tickbuf;
-	int mib[3];
+	int mib[5], miblen, mibparam;
 	size_t i;
 	struct gmonhdr h;
 	int fromindex, endfrom, toindex;
 	size_t kcountsize;
+	char gmon_out[sizeof("gmon-XXXXXXXXXXX.out")];
 
-	setprof(kvp, GMON_PROF_OFF);
-	fp = fopen("gmon.out", "w");
+	mib[0] = CTL_KERN;
+	mib[1] = KERN_PROF;
+	if (cpuid < 0) {
+		mibparam = 2;
+		miblen = 3;
+	} else {
+		mib[2] = GPROF_PERCPU;
+		mib[3] = cpuid;
+		mibparam = 4;
+		miblen = 5;
+	}
+
+	setprof(kvp, GMON_PROF_OFF, cpuid);
+	if (cpuid < 0)
+		strlcpy(gmon_out, "gmon.out", sizeof(gmon_out));
+	else
+		snprintf(gmon_out, sizeof(gmon_out), "gmon-%d.out", cpuid);
+
+	fp = fopen(gmon_out, "w");
 	if (fp == NULL) {
-		warn("cannot open `gmon.out'");
+		warn("cannot open `%s'", gmon_out);
 		return;
 	}
 
@@ -329,8 +393,6 @@ dumpstate(struct kvmvars *kvp)
 	/*
 	 * Write out the tick buffer.
 	 */
-	mib[0] = CTL_KERN;
-	mib[1] = KERN_PROF;
 	if ((tickbuf = malloc(kcountsize)) == NULL)
 		err(EXIT_FAILURE, "Cannot allocate %zu kcount space",
 		    kcountsize);
@@ -338,9 +400,9 @@ dumpstate(struct kvmvars *kvp)
 		i = kvm_read(kvp->kd, (u_long)kvp->gpm.kcount, tickbuf,
 		    kcountsize);
 	} else {
-		mib[2] = GPROF_COUNT;
+		mib[mibparam] = GPROF_COUNT;
 		i = kcountsize;
-		if (sysctl(mib, 3, tickbuf, &i, NULL, 0) < 0)
+		if (sysctl(mib, miblen, tickbuf, &i, NULL, 0) < 0)
 			i = 0;
 	}
 	if (i != kcountsize)
@@ -361,9 +423,9 @@ dumpstate(struct kvmvars *kvp)
 		i = kvm_read(kvp->kd, (u_long)kvp->gpm.froms, froms,
 		    (size_t)kvp->gpm.fromssize);
 	} else {
-		mib[2] = GPROF_FROMS;
+		mib[mibparam] = GPROF_FROMS;
 		i = kvp->gpm.fromssize;
-		if (sysctl(mib, 3, froms, &i, NULL, 0) < 0)
+		if (sysctl(mib, miblen, froms, &i, NULL, 0) < 0)
 			i = 0;
 	}
 	if (i != kvp->gpm.fromssize)
@@ -377,9 +439,9 @@ dumpstate(struct kvmvars *kvp)
 		i = kvm_read(kvp->kd, (u_long)kvp->gpm.tos, (void *)tos,
 		    (size_t)kvp->gpm.tossize);
 	} else {
-		mib[2] = GPROF_TOS;
+		mib[mibparam] = GPROF_TOS;
 		i = kvp->gpm.tossize;
-		if (sysctl(mib, 3, tos, &i, NULL, 0) < 0)
+		if (sysctl(mib, miblen, tos, &i, NULL, 0) < 0)
 			i = 0;
 	}
 	if (i != kvp->gpm.tossize)
@@ -445,13 +507,25 @@ getprofhz(struct kvmvars *kvp)
  * Reset the kernel profiling date structures.
  */
 static void
-reset(struct kvmvars *kvp)
+reset(struct kvmvars *kvp, int cpuid)
 {
 	char *zbuf;
 	size_t biggest;
-	int mib[3];
+	int mib[5], miblen, mibparam;
 
-	setprof(kvp, GMON_PROF_OFF);
+	mib[0] = CTL_KERN;
+	mib[1] = KERN_PROF;
+	if (cpuid < 0) {
+		mibparam = 2;
+		miblen = 3;
+	} else {
+		mib[2] = GPROF_PERCPU;
+		mib[3] = cpuid;
+		mibparam = 4;
+		miblen = 5;
+	}
+
+	setprof(kvp, GMON_PROF_OFF, cpuid);
 
 	biggest = (size_t)kvp->gpm.kcountsize;
 	if ((size_t)kvp->gpm.fromssize > biggest)
@@ -477,16 +551,14 @@ reset(struct kvmvars *kvp)
 		return;
 	}
 	(void)seteuid(0);
-	mib[0] = CTL_KERN;
-	mib[1] = KERN_PROF;
-	mib[2] = GPROF_COUNT;
-	if (sysctl(mib, 3, NULL, NULL, zbuf, (size_t)kvp->gpm.kcountsize) < 0)
+	mib[mibparam] = GPROF_COUNT;
+	if (sysctl(mib, miblen, NULL, NULL, zbuf, (size_t)kvp->gpm.kcountsize) < 0)
 		err(EXIT_FAILURE, "tickbuf zero");
-	mib[2] = GPROF_FROMS;
-	if (sysctl(mib, 3, NULL, NULL, zbuf, (size_t)kvp->gpm.fromssize) < 0)
+	mib[mibparam] = GPROF_FROMS;
+	if (sysctl(mib, miblen, NULL, NULL, zbuf, (size_t)kvp->gpm.fromssize) < 0)
 		err(EXIT_FAILURE, "froms zero");
-	mib[2] = GPROF_TOS;
-	if (sysctl(mib, 3, NULL, NULL, zbuf, (size_t)kvp->gpm.tossize) < 0)
+	mib[mibparam] = GPROF_TOS;
+	if (sysctl(mib, miblen, NULL, NULL, zbuf, (size_t)kvp->gpm.tossize) < 0)
 		err(EXIT_FAILURE, "tos zero");
 	(void)seteuid(getuid());
 	free(zbuf);

Reply via email to