Module Name:    src
Committed By:   ryo
Date:           Thu Dec  1 00:32:52 UTC 2022

Modified Files:
        src/sys/arch/aarch64/include: armreg.h
        src/sys/dev/tprof: tprof.c tprof.h tprof_armv7.c tprof_armv8.c
            tprof_ioctl.h tprof_types.h tprof_x86.c tprof_x86_amd.c
            tprof_x86_intel.c
        src/usr.sbin/tprof: tprof.8 tprof.c tprof_analyze.c

Log Message:
Improve tprof(4)

- Multiple events can now be handled simultaneously.
- Counters are now configured in advance with TPROF_IOC_CONFIGURE_EVENT,
  instead of at TPROF_IOC_START.
- The configured counters can be started and stopped repeatedly with
  TPROF_IOC_START/TPROF_IOC_STOP.
- The value of a performance counter can be read at any time as a 64-bit
  value with TPROF_IOC_GETCOUNTS (see the usage sketch below).
- Code common to all backends is now handled in tprof.c as much as possible,
  and the functions on the tprof_backend side have been reimplemented as more
  primitive operations.
- The counter-overflow reset value used for profiling can now be adjusted.
  By default it is calculated from the CPU clock (the speed of the cycle
  counter) and TPROF_HZ, but for some events this default may be too large to
  yield enough samples for profiling. When configuring an event counter, the
  reset value can therefore be specified as a ratio of the default or as an
  absolute value.
- Because of these overall changes, the API and ABI have changed;
  TPROF_VERSION and TPROF_BACKEND_VERSION have been bumped.
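
For illustration, the new kernel-userland sequence might look like the
following minimal sketch (not part of this commit; error handling is
omitted, and the event number is a hypothetical, backend-specific
placeholder):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

#include <dev/tprof/tprof_ioctl.h>

int
main(void)
{
	int fd = open("/dev/tprof", O_RDWR);

	u_int ncounters;
	ioctl(fd, TPROF_IOC_GETNCOUNTERS, &ncounters);

	/* configure counter #0 in advance */
	tprof_param_t param;
	memset(&param, 0, sizeof(param));
	param.p_counter = 0;
	param.p_event = 0x11;	/* hypothetical, backend-specific event */
	param.p_flags = TPROF_PARAM_KERN | TPROF_PARAM_USER;
	ioctl(fd, TPROF_IOC_CONFIGURE_EVENT, &param);

	/* configured counters can be started and stopped repeatedly */
	tprof_countermask_t mask = 1;		/* counter #0 */
	ioctl(fd, TPROF_IOC_START, &mask);

	/* 64-bit counter values can be read at any time */
	tprof_counts_t counts;
	memset(&counts, 0, sizeof(counts));
	counts.c_cpu = 0;			/* CPU to query */
	ioctl(fd, TPROF_IOC_GETCOUNTS, &counts);

	ioctl(fd, TPROF_IOC_STOP, &mask);
	close(fd);
	return 0;
}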


To generate a diff of this commit:
cvs rdiff -u -r1.62 -r1.63 src/sys/arch/aarch64/include/armreg.h
cvs rdiff -u -r1.18 -r1.19 src/sys/dev/tprof/tprof.c
cvs rdiff -u -r1.6 -r1.7 src/sys/dev/tprof/tprof.h
cvs rdiff -u -r1.9 -r1.10 src/sys/dev/tprof/tprof_armv7.c
cvs rdiff -u -r1.17 -r1.18 src/sys/dev/tprof/tprof_armv8.c
cvs rdiff -u -r1.4 -r1.5 src/sys/dev/tprof/tprof_ioctl.h \
    src/sys/dev/tprof/tprof_x86_intel.c
cvs rdiff -u -r1.5 -r1.6 src/sys/dev/tprof/tprof_types.h \
    src/sys/dev/tprof/tprof_x86_amd.c
cvs rdiff -u -r1.1 -r1.2 src/sys/dev/tprof/tprof_x86.c
cvs rdiff -u -r1.16 -r1.17 src/usr.sbin/tprof/tprof.8
cvs rdiff -u -r1.13 -r1.14 src/usr.sbin/tprof/tprof.c
cvs rdiff -u -r1.5 -r1.6 src/usr.sbin/tprof/tprof_analyze.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/aarch64/include/armreg.h
diff -u src/sys/arch/aarch64/include/armreg.h:1.62 src/sys/arch/aarch64/include/armreg.h:1.63
--- src/sys/arch/aarch64/include/armreg.h:1.62	Thu Dec  1 00:29:10 2022
+++ src/sys/arch/aarch64/include/armreg.h	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/* $NetBSD: armreg.h,v 1.62 2022/12/01 00:29:10 ryo Exp $ */
+/* $NetBSD: armreg.h,v 1.63 2022/12/01 00:32:52 ryo Exp $ */
 
 /*-
  * Copyright (c) 2014 The NetBSD Foundation, Inc.
@@ -248,6 +248,10 @@ AARCH64REG_READ_INLINE(id_aa64dfr0_el1)
 #define	 ID_AA64DFR0_EL1_PMUVER_NONE	 0
 #define	 ID_AA64DFR0_EL1_PMUVER_V3	 1
 #define	 ID_AA64DFR0_EL1_PMUVER_NOV3	 2
+#define	 ID_AA64DFR0_EL1_PMUVER_V3P1	 4
+#define	 ID_AA64DFR0_EL1_PMUVER_V3P4	 5
+#define	 ID_AA64DFR0_EL1_PMUVER_V3P5	 6
+#define	 ID_AA64DFR0_EL1_PMUVER_V3P7	 7
 #define	 ID_AA64DFR0_EL1_PMUVER_IMPL	 15
 #define	ID_AA64DFR0_EL1_TRACEVER	__BITS(4,7)
 #define	 ID_AA64DFR0_EL1_TRACEVER_NONE	 0
@@ -1221,6 +1225,7 @@ AARCH64REG_WRITE_INLINE(pmcr_el0)
 #define	PMCR_IMP		__BITS(31,24)	// Implementor code
 #define	PMCR_IDCODE		__BITS(23,16)	// Identification code
 #define	PMCR_N			__BITS(15,11)	// Number of event counters
+#define	PMCR_LP			__BIT(7)	// Long event counter enable
 #define	PMCR_LC			__BIT(6)	// Long cycle counter enable
 #define	PMCR_DP			__BIT(5)	// Disable cycle counter when event
 						// counting is prohibited

Index: src/sys/dev/tprof/tprof.c
diff -u src/sys/dev/tprof/tprof.c:1.18 src/sys/dev/tprof/tprof.c:1.19
--- src/sys/dev/tprof/tprof.c:1.18	Thu Dec  1 00:27:59 2022
+++ src/sys/dev/tprof/tprof.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof.c,v 1.18 2022/12/01 00:27:59 ryo Exp $	*/
+/*	$NetBSD: tprof.c,v 1.19 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*-
  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.18 2022/12/01 00:27:59 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.19 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -42,12 +42,17 @@ __KERNEL_RCSID(0, "$NetBSD: tprof.c,v 1.
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/workqueue.h>
+#include <sys/xcall.h>
 
 #include <dev/tprof/tprof.h>
 #include <dev/tprof/tprof_ioctl.h>
 
 #include "ioconf.h"
 
+#ifndef TPROF_HZ
+#define TPROF_HZ	10000
+#endif
+
 /*
  * locking order:
  *	tprof_reader_lock -> tprof_lock
@@ -73,7 +78,7 @@ typedef struct tprof_buf {
 } tprof_buf_t;
 #define	TPROF_BUF_BYTESIZE(sz) \
 	(sizeof(tprof_buf_t) + (sz) * sizeof(tprof_sample_t))
-#define	TPROF_MAX_SAMPLES_PER_BUF	10000
+#define	TPROF_MAX_SAMPLES_PER_BUF	(TPROF_HZ * 2)
 
 #define	TPROF_MAX_BUF			100
 
@@ -85,14 +90,20 @@ typedef struct {
 } __aligned(CACHE_LINE_SIZE) tprof_cpu_t;
 
 typedef struct tprof_backend {
+	/*
+	 * tprof_backend_softc_t must be passed as an argument to the interrupt
+	 * handler, but this is difficult to arrange on armv7/v8, so
+	 * tprof_backend is exposed instead. Additionally, the softc must be
+	 * placed at the beginning of struct tprof_backend.
+	 */
+	tprof_backend_softc_t tb_softc;
+
 	const char *tb_name;
 	const tprof_backend_ops_t *tb_ops;
 	LIST_ENTRY(tprof_backend) tb_list;
-	int tb_usecount;	/* S: */
 } tprof_backend_t;
 
 static kmutex_t tprof_lock;
-static bool tprof_running;		/* s: */
 static u_int tprof_nworker;		/* L: # of running worker LWPs */
 static lwp_t *tprof_owner;
 static STAILQ_HEAD(, tprof_buf) tprof_list; /* L: global buffer list */
@@ -101,7 +112,7 @@ static struct workqueue *tprof_wq;
 static struct percpu *tprof_cpus __read_mostly;	/* tprof_cpu_t * */
 static u_int tprof_samples_per_buf;
 
-static tprof_backend_t *tprof_backend;	/* S: */
+tprof_backend_t *tprof_backend;	/* S: */
 static LIST_HEAD(, tprof_backend) tprof_backends =
     LIST_HEAD_INITIALIZER(tprof_backend); /* S: */
 
@@ -193,6 +204,7 @@ tprof_worker(struct work *wk, void *dumm
 {
 	tprof_cpu_t * const c = tprof_curcpu();
 	tprof_buf_t *buf;
+	tprof_backend_t *tb;
 	bool shouldstop;
 
 	KASSERT(wk == &c->c_work);
@@ -207,7 +219,8 @@ tprof_worker(struct work *wk, void *dumm
 	 * and put it on the global list for read(2).
 	 */
 	mutex_enter(&tprof_lock);
-	shouldstop = !tprof_running;
+	tb = tprof_backend;
+	shouldstop = (tb == NULL || tb->tb_softc.sc_ctr_running_mask == 0);
 	if (shouldstop) {
 		KASSERT(tprof_nworker > 0);
 		tprof_nworker--;
@@ -283,103 +296,352 @@ tprof_getinfo(struct tprof_info *info)
 }
 
 static int
-tprof_start(const tprof_param_t *param)
+tprof_getncounters(u_int *ncounters)
+{
+	tprof_backend_t *tb;
+
+	tb = tprof_backend;
+	if (tb == NULL)
+		return ENOENT;
+
+	*ncounters = tb->tb_ops->tbo_ncounters();
+	return 0;
+}
+
+static void
+tprof_start_cpu(void *arg1, void *arg2)
+{
+	tprof_backend_t *tb = arg1;
+	tprof_countermask_t runmask = (uintptr_t)arg2;
+
+	tb->tb_ops->tbo_start(runmask);
+}
+
+static void
+tprof_stop_cpu(void *arg1, void *arg2)
+{
+	tprof_backend_t *tb = arg1;
+	tprof_countermask_t stopmask = (uintptr_t)arg2;
+
+	tb->tb_ops->tbo_stop(stopmask);
+}
+
+static int
+tprof_start(tprof_countermask_t runmask)
 {
 	CPU_INFO_ITERATOR cii;
 	struct cpu_info *ci;
-	int error;
-	uint64_t freq;
 	tprof_backend_t *tb;
+	uint64_t xc;
+	int error;
+	bool firstrun;
 
 	KASSERT(mutex_owned(&tprof_startstop_lock));
-	if (tprof_running) {
-		error = EBUSY;
-		goto done;
-	}
 
 	tb = tprof_backend;
 	if (tb == NULL) {
 		error = ENOENT;
 		goto done;
 	}
-	if (tb->tb_usecount > 0) {
-		error = EBUSY;
+
+	runmask &= ~tb->tb_softc.sc_ctr_running_mask;
+	runmask &= tb->tb_softc.sc_ctr_configured_mask;
+	if (runmask == 0) {
+		/*
+		 * targets are already running.
+		 * unconfigured counters are ignored.
+		 */
+		error = 0;
 		goto done;
 	}
 
-	tb->tb_usecount++;
-	freq = tb->tb_ops->tbo_estimate_freq();
-	tprof_samples_per_buf = MIN(freq * 2, TPROF_MAX_SAMPLES_PER_BUF);
-
-	error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker, NULL,
-	    PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
-	if (error != 0) {
-		goto done;
+	firstrun = (tb->tb_softc.sc_ctr_running_mask == 0);
+	if (firstrun) {
+		if (tb->tb_ops->tbo_establish != NULL) {
+			error = tb->tb_ops->tbo_establish(&tb->tb_softc);
+			if (error != 0)
+				goto done;
+		}
+
+		tprof_samples_per_buf = TPROF_MAX_SAMPLES_PER_BUF;
+		error = workqueue_create(&tprof_wq, "tprofmv", tprof_worker,
+		    NULL, PRI_NONE, IPL_SOFTCLOCK, WQ_MPSAFE | WQ_PERCPU);
+		if (error != 0) {
+			if (tb->tb_ops->tbo_disestablish != NULL)
+				tb->tb_ops->tbo_disestablish(&tb->tb_softc);
+			goto done;
+		}
+
+		for (CPU_INFO_FOREACH(cii, ci)) {
+			tprof_cpu_t * const c = tprof_cpu(ci);
+			tprof_buf_t *new;
+			tprof_buf_t *old;
+
+			new = tprof_buf_alloc();
+			old = tprof_buf_switch(c, new);
+			if (old != NULL) {
+				tprof_buf_free(old);
+			}
+			callout_init(&c->c_callout, CALLOUT_MPSAFE);
+			callout_setfunc(&c->c_callout, tprof_kick, ci);
+		}
 	}
 
-	for (CPU_INFO_FOREACH(cii, ci)) {
-		tprof_cpu_t * const c = tprof_cpu(ci);
-		tprof_buf_t *new;
-		tprof_buf_t *old;
+	runmask &= tb->tb_softc.sc_ctr_configured_mask;
+	xc = xc_broadcast(0, tprof_start_cpu, tb, (void *)(uintptr_t)runmask);
+	xc_wait(xc);
+	mutex_enter(&tprof_lock);
+	tb->tb_softc.sc_ctr_running_mask |= runmask;
+	mutex_exit(&tprof_lock);
 
-		new = tprof_buf_alloc();
-		old = tprof_buf_switch(c, new);
-		if (old != NULL) {
-			tprof_buf_free(old);
+	if (firstrun) {
+		for (CPU_INFO_FOREACH(cii, ci)) {
+			tprof_cpu_t * const c = tprof_cpu(ci);
+
+			mutex_enter(&tprof_lock);
+			tprof_nworker++;
+			mutex_exit(&tprof_lock);
+			workqueue_enqueue(tprof_wq, &c->c_work, ci);
 		}
-		callout_init(&c->c_callout, CALLOUT_MPSAFE);
-		callout_setfunc(&c->c_callout, tprof_kick, ci);
 	}
+done:
+	return error;
+}
 
-	error = tb->tb_ops->tbo_start(param);
-	if (error != 0) {
-		KASSERT(tb->tb_usecount > 0);
-		tb->tb_usecount--;
-		tprof_stop1();
+static void
+tprof_stop(tprof_countermask_t stopmask)
+{
+	tprof_backend_t *tb;
+	uint64_t xc;
+
+	tb = tprof_backend;
+	if (tb == NULL)
+		return;
+
+	KASSERT(mutex_owned(&tprof_startstop_lock));
+	stopmask &= tb->tb_softc.sc_ctr_running_mask;
+	if (stopmask == 0) {
+		/* targets are not running */
 		goto done;
 	}
 
+	xc = xc_broadcast(0, tprof_stop_cpu, tb, (void *)(uintptr_t)stopmask);
+	xc_wait(xc);
 	mutex_enter(&tprof_lock);
-	tprof_running = true;
+	tb->tb_softc.sc_ctr_running_mask &= ~stopmask;
 	mutex_exit(&tprof_lock);
-	for (CPU_INFO_FOREACH(cii, ci)) {
-		tprof_cpu_t * const c = tprof_cpu(ci);
 
+	/* have all counters stopped? */
+	if (tb->tb_softc.sc_ctr_running_mask == 0) {
 		mutex_enter(&tprof_lock);
-		tprof_nworker++;
+		cv_broadcast(&tprof_reader_cv);
+		while (tprof_nworker > 0) {
+			cv_wait(&tprof_cv, &tprof_lock);
+		}
 		mutex_exit(&tprof_lock);
-		workqueue_enqueue(tprof_wq, &c->c_work, ci);
+
+		tprof_stop1();
+		if (tb->tb_ops->tbo_disestablish != NULL)
+			tb->tb_ops->tbo_disestablish(&tb->tb_softc);
 	}
 done:
-	return error;
+	;
 }
 
 static void
-tprof_stop(void)
+tprof_init_percpu_counters_offset(void *vp, void *vp2, struct cpu_info *ci)
+{
+	uint64_t *counters_offset = vp;
+	u_int counter = (uintptr_t)vp2;
+
+	tprof_backend_t *tb = tprof_backend;
+	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
+	counters_offset[counter] = param->p_value;
+}
+
+static void
+tprof_configure_event_cpu(void *arg1, void *arg2)
+{
+	tprof_backend_t *tb = arg1;
+	u_int counter = (uintptr_t)arg2;
+	tprof_param_t *param = &tb->tb_softc.sc_count[counter].ctr_param;
+
+	tb->tb_ops->tbo_configure_event(counter, param);
+}
+
+static int
+tprof_configure_event(const tprof_param_t *param)
 {
 	tprof_backend_t *tb;
+	tprof_backend_softc_t *sc;
+	tprof_param_t *sc_param;
+	uint64_t xc;
+	int c, error;
 
-	KASSERT(mutex_owned(&tprof_startstop_lock));
-	if (!tprof_running) {
+	if ((param->p_flags & (TPROF_PARAM_USER | TPROF_PARAM_KERN)) == 0) {
+		error = EINVAL;
 		goto done;
 	}
 
 	tb = tprof_backend;
-	KASSERT(tb->tb_usecount > 0);
-	tb->tb_ops->tbo_stop(NULL);
-	tb->tb_usecount--;
+	if (tb == NULL) {
+		error = ENOENT;
+		goto done;
+	}
+	sc = &tb->tb_softc;
 
-	mutex_enter(&tprof_lock);
-	tprof_running = false;
-	cv_broadcast(&tprof_reader_cv);
-	while (tprof_nworker > 0) {
-		cv_wait(&tprof_cv, &tprof_lock);
+	c = param->p_counter;
+	if (c >= tb->tb_softc.sc_ncounters) {
+		error = EINVAL;
+		goto done;
+	}
+
+	if (tb->tb_ops->tbo_valid_event != NULL) {
+		error = tb->tb_ops->tbo_valid_event(param->p_counter, param);
+		if (error != 0)
+			goto done;
+	}
+
+	/* if already running, stop the counter */
+	if (ISSET(c, tb->tb_softc.sc_ctr_running_mask))
+		tprof_stop(__BIT(c));
+
+	sc->sc_count[c].ctr_bitwidth =
+	    tb->tb_ops->tbo_counter_bitwidth(param->p_counter);
+
+	sc_param = &sc->sc_count[c].ctr_param;
+	memcpy(sc_param, param, sizeof(*sc_param));	/* save copy of param */
+
+	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
+		uint64_t freq, inum, dnum;
+
+		freq = tb->tb_ops->tbo_counter_estimate_freq(c);
+		sc->sc_count[c].ctr_counter_val = freq / TPROF_HZ;
+		if (sc->sc_count[c].ctr_counter_val == 0) {
+			printf("%s: counter#%d frequency (%"PRIu64") is"
+			    " very low relative to TPROF_HZ (%u)\n", __func__,
+			    c, freq, TPROF_HZ);
+			sc->sc_count[c].ctr_counter_val =
+			    4000000000ULL / TPROF_HZ;
+		}
+
+		switch (param->p_flags & TPROF_PARAM_VALUE2_MASK) {
+		case TPROF_PARAM_VALUE2_SCALE:
+			if (sc_param->p_value2 == 0)
+				break;
+			/*
+			 * p_value2 is 64-bit fixed-point
+			 * upper 32 bits are the integer part
+			 * lower 32 bits are the decimal part
+			 */
+			inum = sc_param->p_value2 >> 32;
+			dnum = sc_param->p_value2 & __BITS(31, 0);
+			sc->sc_count[c].ctr_counter_val =
+			    sc->sc_count[c].ctr_counter_val * inum +
+			    (sc->sc_count[c].ctr_counter_val * dnum >> 32);
+			if (sc->sc_count[c].ctr_counter_val == 0)
+				sc->sc_count[c].ctr_counter_val = 1;
+			break;
+		case TPROF_PARAM_VALUE2_TRIGGERCOUNT:
+			if (sc_param->p_value2 == 0)
+				sc_param->p_value2 = 1;
+			if (sc_param->p_value2 >
+			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0)) {
+				sc_param->p_value2 =
+				    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
+			}
+			sc->sc_count[c].ctr_counter_val = sc_param->p_value2;
+			break;
+		default:
+			break;
+		}
+		sc->sc_count[c].ctr_counter_reset_val =
+		    -sc->sc_count[c].ctr_counter_val;
+		sc->sc_count[c].ctr_counter_reset_val &=
+		    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0);
+	} else {
+		sc->sc_count[c].ctr_counter_val = 0;
+		sc->sc_count[c].ctr_counter_reset_val = 0;
 	}
+
+	/* At this point, p_value is used as an initial value */
+	percpu_foreach(tb->tb_softc.sc_ctr_offset_percpu,
+	    tprof_init_percpu_counters_offset, (void *)(uintptr_t)c);
+	/* On the backend side, p_value is used as the reset value */
+	sc_param->p_value = tb->tb_softc.sc_count[c].ctr_counter_reset_val;
+
+	xc = xc_broadcast(0, tprof_configure_event_cpu,
+	    tb, (void *)(uintptr_t)c);
+	xc_wait(xc);
+
+	mutex_enter(&tprof_lock);
+	/* update counter bitmasks */
+	SET(tb->tb_softc.sc_ctr_configured_mask, __BIT(c));
+	CLR(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
+	CLR(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
+	/* profiled counter requires overflow handling */
+	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE)) {
+		SET(tb->tb_softc.sc_ctr_prof_mask, __BIT(c));
+		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
+	}
+	/* counters narrower than 64 bits also require overflow handling */
+	if (sc->sc_count[c].ctr_bitwidth != 64)
+		SET(tb->tb_softc.sc_ctr_ovf_mask, __BIT(c));
 	mutex_exit(&tprof_lock);
 
-	tprof_stop1();
-done:
-	;
+	error = 0;
+
+ done:
+	return error;
+}
+
+static void
+tprof_getcounts_cpu(void *arg1, void *arg2)
+{
+	tprof_backend_t *tb = arg1;
+	tprof_backend_softc_t *sc = &tb->tb_softc;
+	uint64_t *counters = arg2;
+	uint64_t *counters_offset;
+	unsigned int c;
+
+	tprof_countermask_t configmask = sc->sc_ctr_configured_mask;
+	counters_offset = percpu_getref(sc->sc_ctr_offset_percpu);
+	for (c = 0; c < sc->sc_ncounters; c++) {
+		if (ISSET(configmask, __BIT(c))) {
+			uint64_t ctr = tb->tb_ops->tbo_counter_read(c);
+			counters[c] = counters_offset[c] +
+			    ((ctr - sc->sc_count[c].ctr_counter_reset_val) &
+			    __BITS(sc->sc_count[c].ctr_bitwidth - 1, 0));
+		} else {
+			counters[c] = 0;
+		}
+	}
+	percpu_putref(sc->sc_ctr_offset_percpu);
+}
+
+static int
+tprof_getcounts(tprof_counts_t *counts)
+{
+	struct cpu_info *ci;
+	tprof_backend_t *tb;
+	uint64_t xc;
+
+	tb = tprof_backend;
+	if (tb == NULL)
+		return ENOENT;
+
+	if (counts->c_cpu >= ncpu)
+		return ESRCH;
+	ci = cpu_lookup(counts->c_cpu);
+	if (ci == NULL)
+		return ESRCH;
+
+	xc = xc_unicast(0, tprof_getcounts_cpu, tb, counts->c_count, ci);
+	xc_wait(xc);
+
+	counts->c_ncounters = tb->tb_softc.sc_ncounters;
+	counts->c_runningmask = tb->tb_softc.sc_ctr_running_mask;
+	return 0;
 }
 
 /*
@@ -457,7 +719,8 @@ tprof_sample(void *unused, const tprof_f
 	sp->s_pid = l->l_proc->p_pid;
 	sp->s_lwpid = l->l_lid;
 	sp->s_cpuid = c->c_cpuid;
-	sp->s_flags = (tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0;
+	sp->s_flags = ((tfi->tfi_inkernel) ? TPROF_SAMPLE_INKERNEL : 0) |
+	    __SHIFTIN(tfi->tfi_counter, TPROF_SAMPLE_COUNTER_MASK);
 	sp->s_pc = pc;
 	buf->b_used = idx + 1;
 }
@@ -488,10 +751,9 @@ tprof_backend_register(const char *name,
 		return ENOTSUP;
 	}
 #endif
-	tb = kmem_alloc(sizeof(*tb), KM_SLEEP);
+	tb = kmem_zalloc(sizeof(*tb), KM_SLEEP);
 	tb->tb_name = name;
 	tb->tb_ops = ops;
-	tb->tb_usecount = 0;
 	LIST_INSERT_HEAD(&tprof_backends, tb, tb_list);
 #if 1 /* XXX for now */
 	if (tprof_backend == NULL) {
@@ -500,6 +762,13 @@ tprof_backend_register(const char *name,
 #endif
 	mutex_exit(&tprof_startstop_lock);
 
+	/* init backend softc */
+	tb->tb_softc.sc_ncounters = tb->tb_ops->tbo_ncounters();
+	tb->tb_softc.sc_ctr_offset_percpu_size =
+	    sizeof(uint64_t) * tb->tb_softc.sc_ncounters;
+	tb->tb_softc.sc_ctr_offset_percpu =
+	    percpu_alloc(tb->tb_softc.sc_ctr_offset_percpu_size);
+
 	return 0;
 }
 
@@ -520,7 +789,7 @@ tprof_backend_unregister(const char *nam
 		panic("%s: not found '%s'", __func__, name);
 	}
 #endif /* defined(DIAGNOSTIC) */
-	if (tb->tb_usecount > 0) {
+	if (tb->tb_softc.sc_ctr_running_mask != 0) {
 		mutex_exit(&tprof_startstop_lock);
 		return EBUSY;
 	}
@@ -532,6 +801,11 @@ tprof_backend_unregister(const char *nam
 	LIST_REMOVE(tb, tb_list);
 	mutex_exit(&tprof_startstop_lock);
 
+	/* fini backend softc */
+	percpu_free(tb->tb_softc.sc_ctr_offset_percpu,
+	    tb->tb_softc.sc_ctr_offset_percpu_size);
+
+	/* free backend */
 	kmem_free(tb, sizeof(*tb));
 
 	return 0;
@@ -567,8 +841,17 @@ tprof_close(dev_t dev, int flags, int ty
 	mutex_enter(&tprof_lock);
 	tprof_owner = NULL;
 	mutex_exit(&tprof_lock);
-	tprof_stop();
+	tprof_stop(TPROF_COUNTERMASK_ALL);
 	tprof_clear();
+
+	tprof_backend_t *tb = tprof_backend;
+	if (tb != NULL) {
+		KASSERT(tb->tb_softc.sc_ctr_running_mask == 0);
+		tb->tb_softc.sc_ctr_configured_mask = 0;
+		tb->tb_softc.sc_ctr_prof_mask = 0;
+		tb->tb_softc.sc_ctr_ovf_mask = 0;
+	}
+
 	mutex_exit(&tprof_startstop_lock);
 
 	return 0;
@@ -644,6 +927,7 @@ static int
 tprof_ioctl(dev_t dev, u_long cmd, void *data, int flags, struct lwp *l)
 {
 	const tprof_param_t *param;
+	tprof_counts_t *counts;
 	int error = 0;
 
 	KASSERT(minor(dev) == 0);
@@ -654,15 +938,19 @@ tprof_ioctl(dev_t dev, u_long cmd, void 
 		tprof_getinfo(data);
 		mutex_exit(&tprof_startstop_lock);
 		break;
+	case TPROF_IOC_GETNCOUNTERS:
+		mutex_enter(&tprof_lock);
+		error = tprof_getncounters((u_int *)data);
+		mutex_exit(&tprof_lock);
+		break;
 	case TPROF_IOC_START:
-		param = data;
 		mutex_enter(&tprof_startstop_lock);
-		error = tprof_start(param);
+		error = tprof_start(*(tprof_countermask_t *)data);
 		mutex_exit(&tprof_startstop_lock);
 		break;
 	case TPROF_IOC_STOP:
 		mutex_enter(&tprof_startstop_lock);
-		tprof_stop();
+		tprof_stop(*(tprof_countermask_t *)data);
 		mutex_exit(&tprof_startstop_lock);
 		break;
 	case TPROF_IOC_GETSTAT:
@@ -670,6 +958,18 @@ tprof_ioctl(dev_t dev, u_long cmd, void 
 		memcpy(data, &tprof_stat, sizeof(tprof_stat));
 		mutex_exit(&tprof_lock);
 		break;
+	case TPROF_IOC_CONFIGURE_EVENT:
+		param = data;
+		mutex_enter(&tprof_startstop_lock);
+		error = tprof_configure_event(param);
+		mutex_exit(&tprof_startstop_lock);
+		break;
+	case TPROF_IOC_GETCOUNTS:
+		counts = data;
+		mutex_enter(&tprof_startstop_lock);
+		error = tprof_getcounts(counts);
+		mutex_exit(&tprof_startstop_lock);
+		break;
 	default:
 		error = EINVAL;
 		break;

Index: src/sys/dev/tprof/tprof.h
diff -u src/sys/dev/tprof/tprof.h:1.6 src/sys/dev/tprof/tprof.h:1.7
--- src/sys/dev/tprof/tprof.h:1.6	Fri Jul 13 07:56:29 2018
+++ src/sys/dev/tprof/tprof.h	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof.h,v 1.6 2018/07/13 07:56:29 maxv Exp $	*/
+/*	$NetBSD: tprof.h,v 1.7 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*-
  * Copyright (c)2008,2009,2010 YAMAMOTO Takashi,
@@ -37,19 +37,45 @@
 
 #include <dev/tprof/tprof_types.h>
 
+struct tprof_backend_softc_counter {
+	tprof_param_t ctr_param;
+	u_int ctr_bitwidth;
+	uint64_t ctr_counter_val;
+	uint64_t ctr_counter_reset_val;
+};
+
+typedef struct tprof_backend_softc {
+	u_int sc_ncounters;
+	tprof_countermask_t sc_ctr_running_mask;/* start/stop */
+	tprof_countermask_t sc_ctr_configured_mask;	/* configured */
+	tprof_countermask_t sc_ctr_ovf_mask;	/* overflow intr required */
+	tprof_countermask_t sc_ctr_prof_mask;	/* profiled */
+	percpu_t *sc_ctr_offset_percpu;
+	size_t sc_ctr_offset_percpu_size;
+	struct tprof_backend_softc_counter sc_count[TPROF_MAXCOUNTERS];
+} tprof_backend_softc_t;
+
 typedef struct tprof_backend_ops {
-	uint64_t (*tbo_estimate_freq)(void);	/* samples per second */
 	uint32_t (*tbo_ident)(void);
-	int (*tbo_start)(const tprof_param_t *);
-	void (*tbo_stop)(const tprof_param_t *);
+	u_int (*tbo_ncounters)(void);
+	u_int (*tbo_counter_bitwidth)(u_int);
+	uint64_t (*tbo_counter_read)(u_int);
+	uint64_t (*tbo_counter_estimate_freq)(u_int);
+	int (*tbo_valid_event)(u_int, const tprof_param_t *);
+	void (*tbo_configure_event)(u_int, const tprof_param_t *);
+	void (*tbo_start)(tprof_countermask_t);
+	void (*tbo_stop)(tprof_countermask_t);
+	int (*tbo_establish)(tprof_backend_softc_t *);
+	void (*tbo_disestablish)(tprof_backend_softc_t *);
 } tprof_backend_ops_t;
 
-#define	TPROF_BACKEND_VERSION	3
+#define	TPROF_BACKEND_VERSION	4
 int tprof_backend_register(const char *, const tprof_backend_ops_t *, int);
 int tprof_backend_unregister(const char *);
 
 typedef struct {
 	uintptr_t tfi_pc;	/* program counter */
+	u_int tfi_counter;	/* counter. 0..(TPROF_MAXCOUNTERS-1) */
 	bool tfi_inkernel;	/* if tfi_pc is in the kernel address space */
 } tprof_frame_info_t;
 

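For backend authors, a hypothetical skeleton against the new (version 4)
ops vector above could look as follows; the mydrv_* names are illustrative
only and not part of this commit:

/* hypothetical backend skeleton; mydrv_* names are illustrative only */
static uint32_t
mydrv_ident(void)
{
	return TPROF_IDENT_NONE;
}

static u_int
mydrv_ncounters(void)
{
	return 4;		/* number of counters in hardware */
}

static u_int
mydrv_counter_bitwidth(u_int counter)
{
	return 48;		/* width of each counter */
}

static uint64_t
mydrv_counter_read(u_int counter)
{
	return 0;		/* read the raw counter on the calling CPU */
}

static uint64_t
mydrv_counter_estimate_freq(u_int counter)
{
	return curcpu()->ci_data.cpu_cc_freq;
}

static void
mydrv_configure_event(u_int counter, const tprof_param_t *param)
{
	/* program the event; param->p_value holds the reset value here */
}

static void
mydrv_start(tprof_countermask_t runmask)
{
	/* called on each CPU via xc_broadcast(); enable counters in runmask */
}

static void
mydrv_stop(tprof_countermask_t stopmask)
{
	/* called on each CPU via xc_broadcast(); disable counters in stopmask */
}

static const tprof_backend_ops_t tprof_mydrv_ops = {
	.tbo_ident = mydrv_ident,
	.tbo_ncounters = mydrv_ncounters,
	.tbo_counter_bitwidth = mydrv_counter_bitwidth,
	.tbo_counter_read = mydrv_counter_read,
	.tbo_counter_estimate_freq = mydrv_counter_estimate_freq,
	.tbo_valid_event = NULL,		/* optional */
	.tbo_configure_event = mydrv_configure_event,
	.tbo_start = mydrv_start,
	.tbo_stop = mydrv_stop,
	.tbo_establish = NULL,			/* optional */
	.tbo_disestablish = NULL,
};

/* registered with the bumped ABI version */
int
mydrv_init(void)
{
	return tprof_backend_register("tprof_mydrv", &tprof_mydrv_ops,
	    TPROF_BACKEND_VERSION);
}
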
Index: src/sys/dev/tprof/tprof_armv7.c
diff -u src/sys/dev/tprof/tprof_armv7.c:1.9 src/sys/dev/tprof/tprof_armv7.c:1.10
--- src/sys/dev/tprof/tprof_armv7.c:1.9	Thu Dec  1 00:29:51 2022
+++ src/sys/dev/tprof/tprof_armv7.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/* $NetBSD: tprof_armv7.c,v 1.9 2022/12/01 00:29:51 ryo Exp $ */
+/* $NetBSD: tprof_armv7.c,v 1.10 2022/12/01 00:32:52 ryo Exp $ */
 
 /*-
  * Copyright (c) 2018 Jared McNeill <jmcne...@invisible.ca>
@@ -27,11 +27,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof_armv7.c,v 1.9 2022/12/01 00:29:51 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof_armv7.c,v 1.10 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
+#include <sys/percpu.h>
 #include <sys/xcall.h>
 
 #include <dev/tprof/tprof.h>
@@ -50,15 +51,13 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_armv7.
 #define	PMCNTEN_C		__BIT(31)
 #define	PMCNTEN_P		__BITS(30,0)
 
+#define	PMOVS_C			__BIT(31)
+#define	PMOVS_P			__BITS(30,0)
+
 #define	PMEVTYPER_P		__BIT(31)
 #define	PMEVTYPER_U		__BIT(30)
 #define	PMEVTYPER_EVTCOUNT	__BITS(7,0)
 
-static tprof_param_t armv7_pmu_param;
-static const u_int armv7_pmu_counter = 1;
-static uint32_t counter_val;
-static uint32_t counter_reset_val;
-
 static uint16_t cortexa9_events[] = {
 	0x40, 0x41, 0x42,
 	0x50, 0x51,
@@ -118,7 +117,7 @@ armv7_pmu_set_pmevtyper(u_int counter, u
 	armreg_pmxevtyper_write(val);
 }
 
-static void
+static inline void
 armv7_pmu_set_pmevcntr(u_int counter, uint32_t val)
 {
 	armreg_pmselr_write(counter);
@@ -126,138 +125,175 @@ armv7_pmu_set_pmevcntr(u_int counter, ui
 	armreg_pmxevcntr_write(val);
 }
 
-static void
-armv7_pmu_start_cpu(void *arg1, void *arg2)
+static inline uint64_t
+armv7_pmu_get_pmevcntr(u_int counter)
 {
-	const uint32_t counter_mask = __BIT(armv7_pmu_counter);
-	uint64_t pmcr, pmevtyper;
-
-	/* Enable performance monitor */
-	pmcr = armreg_pmcr_read();
-	pmcr |= PMCR_E;
-	armreg_pmcr_write(pmcr);
-
-	/* Disable event counter */
-	armreg_pmcntenclr_write(counter_mask);
-
-	/* Configure event counter */
-	pmevtyper = __SHIFTIN(armv7_pmu_param.p_event, PMEVTYPER_EVTCOUNT);
-	if (!ISSET(armv7_pmu_param.p_flags, TPROF_PARAM_USER))
-		pmevtyper |= PMEVTYPER_U;
-	if (!ISSET(armv7_pmu_param.p_flags, TPROF_PARAM_KERN))
-		pmevtyper |= PMEVTYPER_P;
-
-	armv7_pmu_set_pmevtyper(armv7_pmu_counter, pmevtyper);
-
-	/* Enable overflow interrupts */
-	armreg_pmintenset_write(counter_mask);
-
-	/* Clear overflow flag */
-	armreg_pmovsr_write(counter_mask);
+	armreg_pmselr_write(counter);
+	isb();
+	return armreg_pmxevcntr_read();
+}
 
-	/* Initialize event counter value */
-	armv7_pmu_set_pmevcntr(armv7_pmu_counter, counter_reset_val);
+/* read and write at once */
+static inline uint64_t
+armv7_pmu_getset_pmevcntr(u_int counter, uint64_t val)
+{
+	uint64_t c;
 
-	/* Enable event counter */
-	armreg_pmcntenset_write(counter_mask);
+	armreg_pmselr_write(counter);
+	isb();
+	c = armreg_pmxevcntr_read();
+	armreg_pmxevcntr_write(val);
+	return c;
 }
 
-static void
-armv7_pmu_stop_cpu(void *arg1, void *arg2)
+static uint32_t
+armv7_pmu_ncounters(void)
 {
-	const uint32_t counter_mask = __BIT(armv7_pmu_counter);
-
-	/* Disable overflow interrupts */
-	armreg_pmintenclr_write(counter_mask);
+	return __SHIFTOUT(armreg_pmcr_read(), PMCR_N);
+}
 
-	/* Disable event counter */
-	armreg_pmcntenclr_write(counter_mask);
+static u_int
+armv7_pmu_counter_bitwidth(u_int counter)
+{
+	return 32;
 }
 
 static uint64_t
-armv7_pmu_estimate_freq(void)
+armv7_pmu_counter_estimate_freq(u_int counter)
 {
 	uint64_t cpufreq = curcpu()->ci_data.cpu_cc_freq;
-	uint64_t freq = 10000;
-	uint32_t pmcr;
-
-	counter_val = cpufreq / freq;
-	if (counter_val == 0)
-		counter_val = 4000000000ULL / freq;
-
-	pmcr = armreg_pmcr_read();
-	if (pmcr & PMCR_D)
-		counter_val /= 64;
 
-	return freq;
-}
-
-static uint32_t
-armv7_pmu_ident(void)
-{
-	return TPROF_IDENT_ARMV7_GENERIC;
+	if (ISSET(armreg_pmcr_read(), PMCR_D))
+		cpufreq /= 64;
+	return cpufreq;
 }
 
 static int
-armv7_pmu_start(const tprof_param_t *param)
+armv7_pmu_valid_event(u_int counter, const tprof_param_t *param)
 {
-	/* PMCR.N of 0 means that no event counters are available */
-	if (__SHIFTOUT(armreg_pmcr_read(), PMCR_N) == 0) {
-		return EINVAL;
-	}
-
 	if (!armv7_pmu_event_implemented(param->p_event)) {
-		printf("%s: event %#llx not implemented on this CPU\n",
+		printf("%s: event %#" PRIx64 " not implemented on this CPU\n",
 		    __func__, param->p_event);
 		return EINVAL;
 	}
+	return 0;
+}
 
-	counter_reset_val = -counter_val + 1;
+static void
+armv7_pmu_configure_event(u_int counter, const tprof_param_t *param)
+{
+	/* Disable event counter */
+	armreg_pmcntenclr_write(__BIT(counter) & PMCNTEN_P);
 
-	armv7_pmu_param = *param;
-	uint64_t xc = xc_broadcast(0, armv7_pmu_start_cpu, NULL, NULL);
-	xc_wait(xc);
+	/* Disable overflow interrupts */
+	armreg_pmintenclr_write(__BIT(counter) & PMINTEN_P);
 
-	return 0;
+	/* Configure event counter */
+	uint32_t pmevtyper = __SHIFTIN(param->p_event, PMEVTYPER_EVTCOUNT);
+	if (!ISSET(param->p_flags, TPROF_PARAM_USER))
+		pmevtyper |= PMEVTYPER_U;
+	if (!ISSET(param->p_flags, TPROF_PARAM_KERN))
+		pmevtyper |= PMEVTYPER_P;
+	armv7_pmu_set_pmevtyper(counter, pmevtyper);
+
+	/*
+	 * Enable overflow interrupts.
+	 * Whether profiled or not, the counter width of armv7 is 32 bits,
+	 * so overflow handling is required anyway.
+	 */
+	armreg_pmintenset_write(__BIT(counter) & PMINTEN_P);
+
+	/* Clear overflow flag */
+	armreg_pmovsr_write(__BIT(counter) & PMOVS_P);
+
+	/* reset the counter */
+	armv7_pmu_set_pmevcntr(counter, param->p_value);
 }
 
 static void
-armv7_pmu_stop(const tprof_param_t *param)
+armv7_pmu_start(tprof_countermask_t runmask)
 {
-	uint64_t xc;
+	/* Enable event counters */
+	armreg_pmcntenset_write(runmask & PMCNTEN_P);
 
-	xc = xc_broadcast(0, armv7_pmu_stop_cpu, NULL, NULL);
-	xc_wait(xc);
+	/*
+	 * PMCR.E is shared with PMCCNTR and event counters.
+	 * It is set here in case PMCCNTR is not used in the system.
+	 */
+	armreg_pmcr_write(armreg_pmcr_read() | PMCR_E);
 }
 
-static const tprof_backend_ops_t tprof_armv7_pmu_ops = {
-	.tbo_estimate_freq = armv7_pmu_estimate_freq,
-	.tbo_ident = armv7_pmu_ident,
-	.tbo_start = armv7_pmu_start,
-	.tbo_stop = armv7_pmu_stop,
-};
+static void
+armv7_pmu_stop(tprof_countermask_t stopmask)
+{
+	/* Disable event counter */
+	armreg_pmcntenclr_write(stopmask & PMCNTEN_P);
+}
+
+/* XXX: argument of armv7_pmu_intr() */
+extern struct tprof_backend *tprof_backend;
+static void *pmu_intr_arg;
 
 int
 armv7_pmu_intr(void *priv)
 {
 	const struct trapframe * const tf = priv;
-	const uint32_t counter_mask = __BIT(armv7_pmu_counter);
+	tprof_backend_softc_t *sc = pmu_intr_arg;
 	tprof_frame_info_t tfi;
+	int bit;
+	const uint32_t pmovs = armreg_pmovsr_read() & PMOVS_P;
 
-	const uint32_t pmovsr = armreg_pmovsr_read();
-	if ((pmovsr & counter_mask) != 0) {
-		tfi.tfi_pc = tf->tf_pc;
-		tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS &&
-		    tfi.tfi_pc < VM_MAX_KERNEL_ADDRESS;
-		tprof_sample(NULL, &tfi);
-
-		armv7_pmu_set_pmevcntr(armv7_pmu_counter, counter_reset_val);
+	uint64_t *counters_offset =
+	    percpu_getptr_remote(sc->sc_ctr_offset_percpu, curcpu());
+	uint32_t mask = pmovs;
+	while ((bit = ffs(mask)) != 0) {
+		bit--;
+		CLR(mask, __BIT(bit));
+
+		if (ISSET(sc->sc_ctr_prof_mask, __BIT(bit))) {
+			/* account for the counter, and reset */
+			uint64_t ctr = armv7_pmu_getset_pmevcntr(bit,
+			    sc->sc_count[bit].ctr_counter_reset_val);
+			counters_offset[bit] +=
+			    sc->sc_count[bit].ctr_counter_val + ctr;
+
+			/* record a sample */
+			tfi.tfi_pc = tf->tf_pc;
+			tfi.tfi_counter = bit;
+			tfi.tfi_inkernel =
+			    tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS &&
+			    tfi.tfi_pc < VM_MAX_KERNEL_ADDRESS;
+			tprof_sample(NULL, &tfi);
+		} else {
+			/* counter has overflowed */
+			counters_offset[bit] += __BIT(32);
+		}
 	}
-	armreg_pmovsr_write(pmovsr);
+	armreg_pmovsr_write(pmovs);
 
 	return 1;
 }
 
+static uint32_t
+armv7_pmu_ident(void)
+{
+	return TPROF_IDENT_ARMV7_GENERIC;
+}
+
+static const tprof_backend_ops_t tprof_armv7_pmu_ops = {
+	.tbo_ident = armv7_pmu_ident,
+	.tbo_ncounters = armv7_pmu_ncounters,
+	.tbo_counter_bitwidth = armv7_pmu_counter_bitwidth,
+	.tbo_counter_read = armv7_pmu_get_pmevcntr,
+	.tbo_counter_estimate_freq = armv7_pmu_counter_estimate_freq,
+	.tbo_valid_event = armv7_pmu_valid_event,
+	.tbo_configure_event = armv7_pmu_configure_event,
+	.tbo_start = armv7_pmu_start,
+	.tbo_stop = armv7_pmu_stop,
+	.tbo_establish = NULL,
+	.tbo_disestablish = NULL,
+};
+
 static void
 armv7_pmu_init_cpu(void *arg1, void *arg2)
 {
@@ -274,9 +310,21 @@ armv7_pmu_init_cpu(void *arg1, void *arg
 int
 armv7_pmu_init(void)
 {
+	int error, ncounters;
+
+	ncounters = armv7_pmu_ncounters();
+	if (ncounters == 0)
+		return ENOTSUP;
+
 	uint64_t xc = xc_broadcast(0, armv7_pmu_init_cpu, NULL, NULL);
 	xc_wait(xc);
 
-	return tprof_backend_register("tprof_armv7", &tprof_armv7_pmu_ops,
+	error = tprof_backend_register("tprof_armv7", &tprof_armv7_pmu_ops,
 	    TPROF_BACKEND_VERSION);
+	if (error == 0) {
+		/* XXX: for argument of armv7_pmu_intr() */
+		pmu_intr_arg = tprof_backend;
+	}
+
+	return error;
 }

Index: src/sys/dev/tprof/tprof_armv8.c
diff -u src/sys/dev/tprof/tprof_armv8.c:1.17 src/sys/dev/tprof/tprof_armv8.c:1.18
--- src/sys/dev/tprof/tprof_armv8.c:1.17	Thu Dec  1 00:29:10 2022
+++ src/sys/dev/tprof/tprof_armv8.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/* $NetBSD: tprof_armv8.c,v 1.17 2022/12/01 00:29:10 ryo Exp $ */
+/* $NetBSD: tprof_armv8.c,v 1.18 2022/12/01 00:32:52 ryo Exp $ */
 
 /*-
  * Copyright (c) 2018 Jared McNeill <jmcne...@invisible.ca>
@@ -27,11 +27,12 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof_armv8.c,v 1.17 2022/12/01 00:29:10 ryo Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof_armv8.c,v 1.18 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/cpu.h>
+#include <sys/percpu.h>
 #include <sys/xcall.h>
 
 #include <dev/tprof/tprof.h>
@@ -41,10 +42,12 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_armv8.
 
 #include <dev/tprof/tprof_armv8.h>
 
-static tprof_param_t armv8_pmu_param;
-static const u_int armv8_pmu_counter = 0;
-static uint32_t counter_val;
-static uint32_t counter_reset_val;
+static u_int counter_bitwidth;
+
+/*
+ * armv8 can handle up to 31 event counters;
+ * PMCR_EL0.N counters are actually available.
+ */
 
 static bool
 armv8_pmu_event_implemented(uint16_t event)
@@ -75,137 +78,178 @@ armv8_pmu_set_pmevtyper(u_int counter, u
 	reg_pmxevtyper_el0_write(val);
 }
 
-static void
-armv8_pmu_set_pmevcntr(u_int counter, uint32_t val)
+static inline void
+armv8_pmu_set_pmevcntr(u_int counter, uint64_t val)
 {
 	reg_pmselr_el0_write(counter);
 	isb();
 	reg_pmxevcntr_el0_write(val);
 }
 
-static void
-armv8_pmu_start_cpu(void *arg1, void *arg2)
+static inline uint64_t
+armv8_pmu_get_pmevcntr(u_int counter)
 {
-	const uint32_t counter_mask = __BIT(armv8_pmu_counter);
-	uint64_t pmevtyper;
-
-	/* Disable event counter */
-	reg_pmcntenclr_el0_write(counter_mask);
-
-	/* Configure event counter */
-	pmevtyper = __SHIFTIN(armv8_pmu_param.p_event, PMEVTYPER_EVTCOUNT);
-	if (!ISSET(armv8_pmu_param.p_flags, TPROF_PARAM_USER))
-		pmevtyper |= PMEVTYPER_U;
-	if (!ISSET(armv8_pmu_param.p_flags, TPROF_PARAM_KERN))
-		pmevtyper |= PMEVTYPER_P;
-
-	armv8_pmu_set_pmevtyper(armv8_pmu_counter, pmevtyper);
-
-	/* Enable overflow interrupts */
-	reg_pmintenset_el1_write(counter_mask);
-
-	/* Clear overflow flag */
-	reg_pmovsclr_el0_write(counter_mask);
-
-	/* Initialize event counter value */
-	armv8_pmu_set_pmevcntr(armv8_pmu_counter, counter_reset_val);
-
-	/* Enable event counter */
-	reg_pmcntenset_el0_write(counter_mask);
-	reg_pmcr_el0_write(reg_pmcr_el0_read() | PMCR_E);
+	reg_pmselr_el0_write(counter);
+	isb();
+	return reg_pmxevcntr_el0_read();
 }
 
-static void
-armv8_pmu_stop_cpu(void *arg1, void *arg2)
+/* read and write at once */
+static inline uint64_t
+armv8_pmu_getset_pmevcntr(u_int counter, uint64_t val)
 {
-	const uint32_t counter_mask = __BIT(armv8_pmu_counter);
-
-	/* Disable overflow interrupts */
-	reg_pmintenclr_el1_write(counter_mask);
+	uint64_t c;
 
-	/* Disable event counter */
-	reg_pmcntenclr_el0_write(counter_mask);
+	reg_pmselr_el0_write(counter);
+	isb();
+	c = reg_pmxevcntr_el0_read();
+	reg_pmxevcntr_el0_write(val);
+	return c;
 }
 
-static uint64_t
-armv8_pmu_estimate_freq(void)
+static uint32_t
+armv8_pmu_ncounters(void)
 {
-	uint64_t cpufreq = curcpu()->ci_data.cpu_cc_freq;
-	uint64_t freq = 10000;
-
-	counter_val = cpufreq / freq;
-	if (counter_val == 0)
-		counter_val = 4000000000ULL / freq;
+	return __SHIFTOUT(reg_pmcr_el0_read(), PMCR_N);
+}
 
-	return freq;
+static u_int
+armv8_pmu_counter_bitwidth(u_int counter)
+{
+	return counter_bitwidth;
 }
 
-static uint32_t
-armv8_pmu_ident(void)
+static uint64_t
+armv8_pmu_counter_estimate_freq(u_int counter)
 {
-	return TPROF_IDENT_ARMV8_GENERIC;
+	return curcpu()->ci_data.cpu_cc_freq;
 }
 
 static int
-armv8_pmu_start(const tprof_param_t *param)
+armv8_pmu_valid_event(u_int counter, const tprof_param_t *param)
 {
-	/* PMCR.N of 0 means that no event counters are available */
-	if (__SHIFTOUT(reg_pmcr_el0_read(), PMCR_N) == 0) {
-		return EINVAL;
-	}
-
 	if (!armv8_pmu_event_implemented(param->p_event)) {
 		printf("%s: event %#" PRIx64 " not implemented on this CPU\n",
 		    __func__, param->p_event);
 		return EINVAL;
 	}
+	return 0;
+}
+
+static void
+armv8_pmu_configure_event(u_int counter, const tprof_param_t *param)
+{
+	/* Disable event counter */
+	reg_pmcntenclr_el0_write(__BIT(counter) & PMCNTEN_P);
 
-	counter_reset_val = -counter_val + 1;
+	/* Disable overflow interrupts */
+	reg_pmintenclr_el1_write(__BIT(counter) & PMINTEN_P);
 
-	armv8_pmu_param = *param;
-	uint64_t xc = xc_broadcast(0, armv8_pmu_start_cpu, NULL, NULL);
-	xc_wait(xc);
+	/* Configure event counter */
+	uint64_t pmevtyper = __SHIFTIN(param->p_event, PMEVTYPER_EVTCOUNT);
+	if (!ISSET(param->p_flags, TPROF_PARAM_USER))
+		pmevtyper |= PMEVTYPER_U;
+	if (!ISSET(param->p_flags, TPROF_PARAM_KERN))
+		pmevtyper |= PMEVTYPER_P;
+	armv8_pmu_set_pmevtyper(counter, pmevtyper);
 
-	return 0;
+	if (ISSET(param->p_flags, TPROF_PARAM_PROFILE) ||
+	    counter_bitwidth != 64) {
+		/* Enable overflow interrupts */
+		reg_pmintenset_el1_write(__BIT(counter) & PMINTEN_P);
+	}
+
+	/* Clear overflow flag */
+	reg_pmovsclr_el0_write(__BIT(counter) & PMOVS_P);
+
+	/* reset the counter */
+	armv8_pmu_set_pmevcntr(counter, param->p_value);
 }
 
 static void
-armv8_pmu_stop(const tprof_param_t *param)
+armv8_pmu_start(tprof_countermask_t runmask)
 {
-	uint64_t xc;
+	/* Enable event counters */
+	reg_pmcntenset_el0_write(runmask & PMCNTEN_P);
 
-	xc = xc_broadcast(0, armv8_pmu_stop_cpu, NULL, NULL);
-	xc_wait(xc);
+	/*
+	 * PMCR.E is shared with PMCCNTR_EL0 and event counters.
+	 * It is set here in case PMCCNTR_EL0 is not used in the system.
+	 */
+	reg_pmcr_el0_write(reg_pmcr_el0_read() | PMCR_E);
 }
 
-static const tprof_backend_ops_t tprof_armv8_pmu_ops = {
-	.tbo_estimate_freq = armv8_pmu_estimate_freq,
-	.tbo_ident = armv8_pmu_ident,
-	.tbo_start = armv8_pmu_start,
-	.tbo_stop = armv8_pmu_stop,
-};
+static void
+armv8_pmu_stop(tprof_countermask_t stopmask)
+{
+	/* Disable event counter */
+	reg_pmcntenclr_el0_write(stopmask & PMCNTEN_P);
+}
+
+/* XXX: argument of armv8_pmu_intr() */
+extern struct tprof_backend *tprof_backend;
+static void *pmu_intr_arg;
 
 int
 armv8_pmu_intr(void *priv)
 {
 	const struct trapframe * const tf = priv;
-	const uint32_t counter_mask = __BIT(armv8_pmu_counter);
+	tprof_backend_softc_t *sc = pmu_intr_arg;
 	tprof_frame_info_t tfi;
+	int bit;
+	const uint32_t pmovs = reg_pmovsset_el0_read() & PMOVS_P;
 
-	const uint32_t pmovs = reg_pmovsset_el0_read();
-	if ((pmovs & counter_mask) != 0) {
-		tfi.tfi_pc = tf->tf_pc;
-		tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS &&
-		    tfi.tfi_pc < VM_MAX_KERNEL_ADDRESS;
-		tprof_sample(NULL, &tfi);
-
-		armv8_pmu_set_pmevcntr(armv8_pmu_counter, counter_reset_val);
+	uint64_t *counters_offset =
+	    percpu_getptr_remote(sc->sc_ctr_offset_percpu, curcpu());
+	uint32_t mask = pmovs;
+	while ((bit = ffs(mask)) != 0) {
+		bit--;
+		CLR(mask, __BIT(bit));
+
+		if (ISSET(sc->sc_ctr_prof_mask, __BIT(bit))) {
+			/* account for the counter, and reset */
+			uint64_t ctr = armv8_pmu_getset_pmevcntr(bit,
+			    sc->sc_count[bit].ctr_counter_reset_val);
+			counters_offset[bit] +=
+			    sc->sc_count[bit].ctr_counter_val + ctr;
+
+			/* record a sample */
+			tfi.tfi_pc = tf->tf_pc;
+			tfi.tfi_counter = bit;
+			tfi.tfi_inkernel =
+			    tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS &&
+			    tfi.tfi_pc < VM_MAX_KERNEL_ADDRESS;
+			tprof_sample(NULL, &tfi);
+		} else {
+			/* counter has overflowed */
+			counters_offset[bit] += __BIT(32);
+		}
 	}
 	reg_pmovsclr_el0_write(pmovs);
 
 	return 1;
 }
 
+static uint32_t
+armv8_pmu_ident(void)
+{
+	return TPROF_IDENT_ARMV8_GENERIC;
+}
+
+static const tprof_backend_ops_t tprof_armv8_pmu_ops = {
+	.tbo_ident = armv8_pmu_ident,
+	.tbo_ncounters = armv8_pmu_ncounters,
+	.tbo_counter_bitwidth = armv8_pmu_counter_bitwidth,
+	.tbo_counter_read = armv8_pmu_get_pmevcntr,
+	.tbo_counter_estimate_freq = armv8_pmu_counter_estimate_freq,
+	.tbo_valid_event = armv8_pmu_valid_event,
+	.tbo_configure_event = armv8_pmu_configure_event,
+	.tbo_start = armv8_pmu_start,
+	.tbo_stop = armv8_pmu_stop,
+	.tbo_establish = NULL,
+	.tbo_disestablish = NULL,
+};
+
 static void
 armv8_pmu_init_cpu(void *arg1, void *arg2)
 {
@@ -232,11 +276,32 @@ armv8_pmu_detect(void)
 int
 armv8_pmu_init(void)
 {
+	int error, ncounters;
+
 	KASSERT(armv8_pmu_detect());
 
+	ncounters = armv8_pmu_ncounters();
+	if (ncounters == 0)
+		return ENOTSUP;
+
+	/* Is the 64-bit event counter available? */
+	const uint64_t dfr0 = reg_id_aa64dfr0_el1_read();
+	const u_int pmuver = __SHIFTOUT(dfr0, ID_AA64DFR0_EL1_PMUVER);
+	if (pmuver >= ID_AA64DFR0_EL1_PMUVER_V3P5 &&
+	    ISSET(reg_pmcr_el0_read(), PMCR_LP))
+		counter_bitwidth = 64;
+	else
+		counter_bitwidth = 32;
+
 	uint64_t xc = xc_broadcast(0, armv8_pmu_init_cpu, NULL, NULL);
 	xc_wait(xc);
 
-	return tprof_backend_register("tprof_armv8", &tprof_armv8_pmu_ops,
+	error = tprof_backend_register("tprof_armv8", &tprof_armv8_pmu_ops,
 	    TPROF_BACKEND_VERSION);
+	if (error == 0) {
+		/* XXX: for argument of armv8_pmu_intr() */
+		pmu_intr_arg = tprof_backend;
+	}
+
+	return error;
 }

Index: src/sys/dev/tprof/tprof_ioctl.h
diff -u src/sys/dev/tprof/tprof_ioctl.h:1.4 src/sys/dev/tprof/tprof_ioctl.h:1.5
--- src/sys/dev/tprof/tprof_ioctl.h:1.4	Fri Jul 13 07:56:29 2018
+++ src/sys/dev/tprof/tprof_ioctl.h	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_ioctl.h,v 1.4 2018/07/13 07:56:29 maxv Exp $	*/
+/*	$NetBSD: tprof_ioctl.h,v 1.5 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*-
  * Copyright (c)2008,2010 YAMAMOTO Takashi,
@@ -37,17 +37,12 @@
 
 #include <dev/tprof/tprof_types.h>
 
-#define	TPROF_VERSION	4	/* kernel-userland ABI version */
+#define	TPROF_VERSION	5	/* kernel-userland ABI version */
 
 struct tprof_info {
 	uint32_t ti_version;
 	uint32_t ti_ident;
 };
-#define	TPROF_IOC_GETINFO	_IOR('T', 1, struct tprof_info)
-
-#define	TPROF_IOC_START		_IOW('T', 2, tprof_param_t)
-
-#define	TPROF_IOC_STOP		_IO('T', 3)
 
 struct tprof_stat {
 	uint64_t ts_sample;	/* samples successfully recorded */
@@ -57,6 +52,13 @@ struct tprof_stat {
 	uint64_t ts_dropbuf;	/* buffers dropped due to the global limit */
 	uint64_t ts_dropbuf_sample; /* samples dropped with ts_dropbuf */
 };
-#define	TPROF_IOC_GETSTAT	_IOR('T', 4, struct tprof_stat)
+
+#define	TPROF_IOC_GETINFO		_IOR('T', 1, struct tprof_info)
+#define	TPROF_IOC_START			_IOW('T', 2, tprof_countermask_t)
+#define	TPROF_IOC_STOP			_IOW('T', 3, tprof_countermask_t)
+#define	TPROF_IOC_GETSTAT		_IOR('T', 4, struct tprof_stat)
+#define	TPROF_IOC_GETNCOUNTERS		_IOR('T', 5, u_int)
+#define	TPROF_IOC_CONFIGURE_EVENT	_IOW('T', 6, tprof_param_t)
+#define	TPROF_IOC_GETCOUNTS		_IOWR('T', 7, tprof_counts_t)
 
 #endif /* _DEV_TPROF_TPROF_IOCTL_H_ */
Index: src/sys/dev/tprof/tprof_x86_intel.c
diff -u src/sys/dev/tprof/tprof_x86_intel.c:1.4 src/sys/dev/tprof/tprof_x86_intel.c:1.5
--- src/sys/dev/tprof/tprof_x86_intel.c:1.4	Thu May 26 13:02:04 2022
+++ src/sys/dev/tprof/tprof_x86_intel.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_x86_intel.c,v 1.4 2022/05/26 13:02:04 msaitoh Exp $	*/
+/*	$NetBSD: tprof_x86_intel.c,v 1.5 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -56,15 +56,15 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof_x86_intel.c,v 1.4 2022/05/26 13:02:04 msaitoh Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof_x86_intel.c,v 1.5 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
-#include <sys/device.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 
 #include <sys/cpu.h>
+#include <sys/percpu.h>
 #include <sys/xcall.h>
 
 #include <dev/tprof/tprof.h>
@@ -79,6 +79,12 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_x86_in
 #include <machine/i82489reg.h>
 #include <machine/i82489var.h>
 
+#define	NCTRS	4	/* XXX */
+static u_int counter_bitwidth;
+
+#define	PERFEVTSEL(i)		(MSR_EVNTSEL0 + (i))
+#define	PERFCTR(i)		(MSR_PERFCTR0 + (i))
+
 #define	PERFEVTSEL_EVENT_SELECT	__BITS(0, 7)
 #define	PERFEVTSEL_UNIT_MASK	__BITS(8, 15)
 #define	PERFEVTSEL_USR		__BIT(16)
@@ -90,72 +96,115 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_x86_in
 #define	PERFEVTSEL_INV		__BIT(23)
 #define	PERFEVTSEL_COUNTER_MASK	__BITS(24, 31)
 
-static uint64_t counter_bitwidth;
-static uint64_t counter_val = 5000000;
-static uint64_t counter_reset_val;
-
 static uint32_t intel_lapic_saved[MAXCPUS];
 static nmi_handler_t *intel_nmi_handle;
-static tprof_param_t intel_param;
+
+static uint32_t
+tprof_intel_ncounters(void)
+{
+	return NCTRS;
+}
+
+static u_int
+tprof_intel_counter_bitwidth(u_int counter)
+{
+	return counter_bitwidth;
+}
+
+static inline void
+tprof_intel_counter_write(u_int counter, uint64_t val)
+{
+	wrmsr(PERFCTR(counter), val);
+}
+
+static inline uint64_t
+tprof_intel_counter_read(u_int counter)
+{
+	return rdmsr(PERFCTR(counter));
+}
 
 static void
-tprof_intel_start_cpu(void *arg1, void *arg2)
+tprof_intel_configure_event(u_int counter, const tprof_param_t *param)
 {
-	struct cpu_info * const ci = curcpu();
 	uint64_t evtval;
 
 	evtval =
-	    __SHIFTIN(intel_param.p_event, PERFEVTSEL_EVENT_SELECT) |
-	    __SHIFTIN(intel_param.p_unit, PERFEVTSEL_UNIT_MASK) |
-	    ((intel_param.p_flags & TPROF_PARAM_USER) ? PERFEVTSEL_USR : 0) |
-	    ((intel_param.p_flags & TPROF_PARAM_KERN) ? PERFEVTSEL_OS : 0) |
-	    PERFEVTSEL_INT |
-	    PERFEVTSEL_EN;
+	    __SHIFTIN(param->p_event, PERFEVTSEL_EVENT_SELECT) |
+	    __SHIFTIN(param->p_unit, PERFEVTSEL_UNIT_MASK) |
+	    ((param->p_flags & TPROF_PARAM_USER) ? PERFEVTSEL_USR : 0) |
+	    ((param->p_flags & TPROF_PARAM_KERN) ? PERFEVTSEL_OS : 0) |
+	    PERFEVTSEL_INT;
+	wrmsr(PERFEVTSEL(counter), evtval);
 
-	wrmsr(MSR_PERFCTR0, counter_reset_val);
-	wrmsr(MSR_EVNTSEL0, evtval);
-
-	intel_lapic_saved[cpu_index(ci)] = lapic_readreg(LAPIC_LVT_PCINT);
-	lapic_writereg(LAPIC_LVT_PCINT, LAPIC_DLMODE_NMI);
+	/* reset the counter */
+	tprof_intel_counter_write(counter, param->p_value);
 }
 
 static void
-tprof_intel_stop_cpu(void *arg1, void *arg2)
+tprof_intel_start(tprof_countermask_t runmask)
 {
-	struct cpu_info * const ci = curcpu();
+	int bit;
 
-	wrmsr(MSR_EVNTSEL0, 0);
-	wrmsr(MSR_PERFCTR0, 0);
+	while ((bit = ffs(runmask)) != 0) {
+		bit--;
+		CLR(runmask, __BIT(bit));
+		wrmsr(PERFEVTSEL(bit), rdmsr(PERFEVTSEL(bit)) | PERFEVTSEL_EN);
+	}
+}
 
-	lapic_writereg(LAPIC_LVT_PCINT, intel_lapic_saved[cpu_index(ci)]);
+static void
+tprof_intel_stop(tprof_countermask_t stopmask)
+{
+	int bit;
+
+	while ((bit = ffs(stopmask)) != 0) {
+		bit--;
+		CLR(stopmask, __BIT(bit));
+		wrmsr(PERFEVTSEL(bit), rdmsr(PERFEVTSEL(bit)) & ~PERFEVTSEL_EN);
+	}
 }
 
 static int
-tprof_intel_nmi(const struct trapframe *tf, void *dummy)
+tprof_intel_nmi(const struct trapframe *tf, void *arg)
 {
-	uint32_t pcint;
-	uint64_t ctr;
+	tprof_backend_softc_t *sc = arg;
 	tprof_frame_info_t tfi;
+	uint32_t pcint;
+	int bit;
 
-	KASSERT(dummy == NULL);
-
-	ctr = rdmsr(MSR_PERFCTR0);
-	/* If the highest bit is non zero, then it's not for us. */
-	if ((ctr & __BIT(counter_bitwidth-1)) != 0) {
-		return 0;
-	}
+	uint64_t *counters_offset =
+	    percpu_getptr_remote(sc->sc_ctr_offset_percpu, curcpu());
+	tprof_countermask_t mask = sc->sc_ctr_ovf_mask;
+	while ((bit = ffs(mask)) != 0) {
+		bit--;
+		CLR(mask, __BIT(bit));
+
+		/* If the highest bit is non zero, then it's not for us. */
+		uint64_t ctr = tprof_intel_counter_read(bit);
+		if ((ctr & __BIT(counter_bitwidth - 1)) != 0)
+			continue;	/* not overflowed */
+
+		if (ISSET(sc->sc_ctr_prof_mask, __BIT(bit))) {
+			/* account for the counter, and reset */
+			tprof_intel_counter_write(bit,
+			    sc->sc_count[bit].ctr_counter_reset_val);
+			counters_offset[bit] +=
+			    sc->sc_count[bit].ctr_counter_val + ctr;
 
-	/* record a sample */
+			/* record a sample */
 #if defined(__x86_64__)
-	tfi.tfi_pc = tf->tf_rip;
+			tfi.tfi_pc = tf->tf_rip;
 #else
-	tfi.tfi_pc = tf->tf_eip;
+			tfi.tfi_pc = tf->tf_eip;
 #endif
-	tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS;
-	tprof_sample(NULL, &tfi);
-
-	/* reset counter */
-	wrmsr(MSR_PERFCTR0, counter_reset_val);
+			tfi.tfi_counter = bit;
+			tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS;
+			tprof_sample(NULL, &tfi);
+		} else {
+			/* not profiled, but overflow still needs accounting */
+			counters_offset[bit] += __BIT(counter_bitwidth);
+		}
+	}
 
 	/* unmask PMI */
 	pcint = lapic_readreg(LAPIC_LVT_PCINT);
@@ -166,16 +215,9 @@ tprof_intel_nmi(const struct trapframe *
 }
 
 static uint64_t
-tprof_intel_estimate_freq(void)
+tprof_intel_counter_estimate_freq(u_int counter)
 {
-	uint64_t cpufreq = curcpu()->ci_data.cpu_cc_freq;
-	uint64_t freq = 10000;
-
-	counter_val = cpufreq / freq;
-	if (counter_val == 0) {
-		counter_val = UINT64_C(4000000000) / freq;
-	}
-	return freq;
+	return curcpu()->ci_data.cpu_cc_freq;
 }
 
 static uint32_t
@@ -203,8 +245,25 @@ tprof_intel_ident(void)
 	return TPROF_IDENT_INTEL_GENERIC;
 }
 
+static void
+tprof_intel_establish_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info * const ci = curcpu();
+
+	intel_lapic_saved[cpu_index(ci)] = lapic_readreg(LAPIC_LVT_PCINT);
+	lapic_writereg(LAPIC_LVT_PCINT, LAPIC_DLMODE_NMI);
+}
+
+static void
+tprof_intel_disestablish_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info * const ci = curcpu();
+
+	lapic_writereg(LAPIC_LVT_PCINT, intel_lapic_saved[cpu_index(ci)]);
+}
+
 static int
-tprof_intel_start(const tprof_param_t *param)
+tprof_intel_establish(tprof_backend_softc_t *sc)
 {
 	uint64_t xc;
 
@@ -213,23 +272,20 @@ tprof_intel_start(const tprof_param_t *p
 	}
 
 	KASSERT(intel_nmi_handle == NULL);
-	intel_nmi_handle = nmi_establish(tprof_intel_nmi, NULL);
-
-	counter_reset_val = - counter_val + 1;
-	memcpy(&intel_param, param, sizeof(*param));
+	intel_nmi_handle = nmi_establish(tprof_intel_nmi, sc);
 
-	xc = xc_broadcast(0, tprof_intel_start_cpu, NULL, NULL);
+	xc = xc_broadcast(0, tprof_intel_establish_cpu, sc, NULL);
 	xc_wait(xc);
 
 	return 0;
 }
 
 static void
-tprof_intel_stop(const tprof_param_t *param)
+tprof_intel_disestablish(tprof_backend_softc_t *sc)
 {
 	uint64_t xc;
 
-	xc = xc_broadcast(0, tprof_intel_stop_cpu, NULL, NULL);
+	xc = xc_broadcast(0, tprof_intel_disestablish_cpu, sc, NULL);
 	xc_wait(xc);
 
 	KASSERT(intel_nmi_handle != NULL);
@@ -238,8 +294,15 @@ tprof_intel_stop(const tprof_param_t *pa
 }
 
 const tprof_backend_ops_t tprof_intel_ops = {
-	.tbo_estimate_freq = tprof_intel_estimate_freq,
 	.tbo_ident = tprof_intel_ident,
+	.tbo_ncounters = tprof_intel_ncounters,
+	.tbo_counter_bitwidth = tprof_intel_counter_bitwidth,
+	.tbo_counter_read = tprof_intel_counter_read,
+	.tbo_counter_estimate_freq = tprof_intel_counter_estimate_freq,
+	.tbo_valid_event = NULL,
+	.tbo_configure_event = tprof_intel_configure_event,
 	.tbo_start = tprof_intel_start,
 	.tbo_stop = tprof_intel_stop,
+	.tbo_establish = tprof_intel_establish,
+	.tbo_disestablish = tprof_intel_disestablish,
 };

Index: src/sys/dev/tprof/tprof_types.h
diff -u src/sys/dev/tprof/tprof_types.h:1.5 src/sys/dev/tprof/tprof_types.h:1.6
--- src/sys/dev/tprof/tprof_types.h:1.5	Sun Jul 15 23:46:25 2018
+++ src/sys/dev/tprof/tprof_types.h	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_types.h,v 1.5 2018/07/15 23:46:25 jmcneill Exp $	*/
+/*	$NetBSD: tprof_types.h,v 1.6 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*-
  * Copyright (c)2010,2011 YAMAMOTO Takashi,
@@ -39,26 +39,55 @@
 #include <stdint.h>
 #endif
 
+#define TPROF_MAXCOUNTERS	32
+typedef uint32_t tprof_countermask_t;
+#define TPROF_COUNTERMASK_ALL	__BITS(31, 0)
+
 typedef struct {
 	uint32_t s_pid;		/* process id */
 	uint32_t s_lwpid;	/* lwp id */
 	uint32_t s_cpuid;	/* cpu id */
-	uint32_t s_flags;	/* flags */
+	uint32_t s_flags;	/* flags and counterID */
+#define TPROF_SAMPLE_INKERNEL	0x00000001 /* s_pc is in kernel address space */
+#define	TPROF_SAMPLE_COUNTER_MASK 0xff000000 /* 0..(TPROF_MAXCOUNTERS-1) */
 	uintptr_t s_pc;		/* program counter */
 } tprof_sample_t;
 
 typedef struct tprof_param {
+	u_int p_counter;	/* 0..(TPROF_MAXCOUNTERS-1) */
+	u_int p__unused;
 	uint64_t p_event;	/* event class */
 	uint64_t p_unit;	/* unit within the event class */
 	uint64_t p_flags;
+#define	TPROF_PARAM_KERN		0x1
+#define	TPROF_PARAM_USER		0x2
+#define	TPROF_PARAM_PROFILE		0x4
+#define	TPROF_PARAM_VALUE2_MASK		__BITS(63, 60)
+#define	TPROF_PARAM_VALUE2_SCALE	__SHIFTIN(1, TPROF_PARAM_VALUE2_MASK)
+#define	TPROF_PARAM_VALUE2_TRIGGERCOUNT	__SHIFTIN(2, TPROF_PARAM_VALUE2_MASK)
+	uint64_t p_value;	/* initial value */
+	uint64_t p_value2;
+	/*
+	 * p_value2 is an optional value. (p_flags & TPROF_PARAM_VALUE2_MASK)
+	 * determines the usage.
+	 *
+	 * TPROF_PARAM_VALUE2_SCALE:
+	 *   Specifies the counter speed as the reciprocal of the cycle counter
+	 *   speed ratio. If the counter is N times slower than the cycle
+	 *   counter, p_value2 is (0x1_0000_0000 / N). 0 is treated as 1.0.
+	 * TPROF_PARAM_VALUE2_TRIGGERCOUNT:
+	 *   When the event counter counts up to p_value2, a profiling
+	 *   interrupt is generated. 0 is treated as 1.
+	 */
 } tprof_param_t;
 
-/* s_flags */
-#define	TPROF_SAMPLE_INKERNEL	1	/* s_pc is in kernel address space */
-
-/* p_flags */
-#define	TPROF_PARAM_KERN	0x01
-#define	TPROF_PARAM_USER	0x02
+typedef struct tprof_counts {
+	uint32_t c_cpu;				/* W */
+	uint32_t c_ncounters;			/* R */
+	tprof_countermask_t c_runningmask;	/* R */
+	uint32_t c__unused;
+	uint64_t c_count[TPROF_MAXCOUNTERS];	/* R */
+} tprof_counts_t;
 
 /* ti_ident */
 #define	TPROF_IDENT_NONE		0x00
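
As a worked example of the p_value2 semantics documented in the
tprof_types.h diff above, here is a sketch of the overflow reset-value
arithmetic as performed by tprof_configure_event() in tprof.c, under
assumed numbers (a 2 GHz cycle counter, the default TPROF_HZ of 10000,
and a 32-bit event counter):

#include <stdint.h>

static uint64_t
reset_val_example(void)
{
	uint64_t freq = 2000000000ULL;		/* assumed 2 GHz */
	uint64_t val = freq / 10000;		/* default: 200000 */

	/*
	 * TPROF_PARAM_VALUE2_SCALE: p_value2 is 32.32 fixed point.
	 * For an event N = 2 times slower than the cycle counter,
	 * p_value2 = 0x1_0000_0000 / N = 0x80000000 (i.e. 0.5).
	 */
	uint64_t p_value2 = 0x80000000ULL;
	uint64_t inum = p_value2 >> 32;			/* integer part: 0 */
	uint64_t dnum = p_value2 & 0xffffffffULL;	/* fraction */
	val = val * inum + ((val * dnum) >> 32);	/* -> 100000 */

	/*
	 * The counter is preloaded with -val truncated to its bit width,
	 * so it overflows (raising a profiling interrupt) every `val`
	 * events, i.e. TPROF_HZ times per second for this event.
	 */
	return (-val) & 0xffffffffULL;			/* 0xfffe7960 */
}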
Index: src/sys/dev/tprof/tprof_x86_amd.c
diff -u src/sys/dev/tprof/tprof_x86_amd.c:1.5 src/sys/dev/tprof/tprof_x86_amd.c:1.6
--- src/sys/dev/tprof/tprof_x86_amd.c:1.5	Fri Oct 11 18:04:52 2019
+++ src/sys/dev/tprof/tprof_x86_amd.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_x86_amd.c,v 1.5 2019/10/11 18:04:52 jmcneill Exp $	*/
+/*	$NetBSD: tprof_x86_amd.c,v 1.6 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -56,7 +56,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof_x86_amd.c,v 1.5 2019/10/11 18:04:52 jmcneill Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof_x86_amd.c,v 1.6 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -64,6 +64,7 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_x86_am
 #include <sys/module.h>
 
 #include <sys/cpu.h>
+#include <sys/percpu.h>
 #include <sys/xcall.h>
 
 #include <dev/tprof/tprof.h>
@@ -78,7 +79,8 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_x86_am
 #include <machine/i82489reg.h>
 #include <machine/i82489var.h>
 
-#define	NCTRS	4
+#define	NCTRS			4
+#define	COUNTER_BITWIDTH	48
 
 #define	PERFEVTSEL(i)		(0xc0010000 + (i))
 #define	PERFCTR(i)		(0xc0010004 + (i))
@@ -106,92 +108,128 @@ __KERNEL_RCSID(0, "$NetBSD: tprof_x86_am
  * http://developer.amd.com/wordpress/media/2012/10/Basic_Performance_Measurements.pdf
  */
 
-static int ctrno = 0;
-static uint64_t counter_val = 5000000;
-static uint64_t counter_reset_val;
 static uint32_t amd_lapic_saved[MAXCPUS];
 static nmi_handler_t *amd_nmi_handle;
-static tprof_param_t amd_param;
+
+static uint32_t
+tprof_amd_ncounters(void)
+{
+	return NCTRS;
+}
+
+static u_int
+tprof_amd_counter_bitwidth(u_int counter)
+{
+	return COUNTER_BITWIDTH;
+}
+
+static inline void
+tprof_amd_counter_write(u_int counter, uint64_t val)
+{
+	wrmsr(PERFCTR(counter), val);
+}
+
+static inline uint64_t
+tprof_amd_counter_read(u_int counter)
+{
+	return rdmsr(PERFCTR(counter));
+}
 
 static void
-tprof_amd_start_cpu(void *arg1, void *arg2)
+tprof_amd_configure_event(u_int counter, const tprof_param_t *param)
 {
-	struct cpu_info * const ci = curcpu();
 	uint64_t pesr;
 	uint64_t event_lo;
 	uint64_t event_hi;
 
-	event_hi = amd_param.p_event >> 8;
-	event_lo = amd_param.p_event & 0xff;
+	event_hi = param->p_event >> 8;
+	event_lo = param->p_event & 0xff;
 	pesr =
-	    ((amd_param.p_flags & TPROF_PARAM_USER) ? PESR_USR : 0) |
-	    ((amd_param.p_flags & TPROF_PARAM_KERN) ? PESR_OS : 0) |
+	    ((param->p_flags & TPROF_PARAM_USER) ? PESR_USR : 0) |
+	    ((param->p_flags & TPROF_PARAM_KERN) ? PESR_OS : 0) |
 	    PESR_INT |
 	    __SHIFTIN(event_lo, PESR_EVENT_MASK_LO) |
 	    __SHIFTIN(event_hi, PESR_EVENT_MASK_HI) |
 	    __SHIFTIN(0, PESR_COUNTER_MASK) |
-	    __SHIFTIN(amd_param.p_unit, PESR_UNIT_MASK);
+	    __SHIFTIN(param->p_unit, PESR_UNIT_MASK);
+	wrmsr(PERFEVTSEL(counter), pesr);
 
-	wrmsr(PERFCTR(ctrno), counter_reset_val);
-	wrmsr(PERFEVTSEL(ctrno), pesr);
+	/* reset the counter */
+	tprof_amd_counter_write(counter, param->p_value);
+}
 
-	amd_lapic_saved[cpu_index(ci)] = lapic_readreg(LAPIC_LVT_PCINT);
-	lapic_writereg(LAPIC_LVT_PCINT, LAPIC_DLMODE_NMI);
+static void
+tprof_amd_start(tprof_countermask_t runmask)
+{
+	int bit;
 
-	wrmsr(PERFEVTSEL(ctrno), pesr | PESR_EN);
+	while ((bit = ffs(runmask)) != 0) {
+		bit--;
+		CLR(runmask, __BIT(bit));
+		wrmsr(PERFEVTSEL(bit), rdmsr(PERFEVTSEL(bit)) | PESR_EN);
+	}
 }
 
 static void
-tprof_amd_stop_cpu(void *arg1, void *arg2)
+tprof_amd_stop(tprof_countermask_t stopmask)
 {
-	struct cpu_info * const ci = curcpu();
+	int bit;
 
-	wrmsr(PERFEVTSEL(ctrno), 0);
-
-	lapic_writereg(LAPIC_LVT_PCINT, amd_lapic_saved[cpu_index(ci)]);
+	while ((bit = ffs(stopmask)) != 0) {
+		bit--;
+		CLR(stopmask, __BIT(bit));
+		wrmsr(PERFEVTSEL(bit), rdmsr(PERFEVTSEL(bit)) & ~PESR_EN);
+	}
 }
 
 static int
-tprof_amd_nmi(const struct trapframe *tf, void *dummy)
+tprof_amd_nmi(const struct trapframe *tf, void *arg)
 {
+	tprof_backend_softc_t *sc = arg;
 	tprof_frame_info_t tfi;
-	uint64_t ctr;
-
-	KASSERT(dummy == NULL);
+	int bit;
 
-	/* check if it's for us */
-	ctr = rdmsr(PERFCTR(ctrno));
-	if ((ctr & (UINT64_C(1) << 63)) != 0) { /* check if overflowed */
-		/* not ours */
-		return 0;
-	}
+	uint64_t *counters_offset =
+	    percpu_getptr_remote(sc->sc_ctr_offset_percpu, curcpu());
+	tprof_countermask_t mask = sc->sc_ctr_ovf_mask;
+	while ((bit = ffs(mask)) != 0) {
+		bit--;
+		CLR(mask, __BIT(bit));
+
+		/* If the highest bit is nonzero, then it's not for us. */
+		uint64_t ctr = tprof_amd_counter_read(bit);
+		if ((ctr & __BIT(COUNTER_BITWIDTH - 1)) != 0)
+			continue;	/* not overflowed */
+
+		if (ISSET(sc->sc_ctr_prof_mask, __BIT(bit))) {
+			/* account for the counter, and reset */
+			tprof_amd_counter_write(bit,
+			    sc->sc_count[bit].ctr_counter_reset_val);
+			counters_offset[bit] +=
+			    sc->sc_count[bit].ctr_counter_val + ctr;
 
-	/* record a sample */
+			/* record a sample */
 #if defined(__x86_64__)
-	tfi.tfi_pc = tf->tf_rip;
+			tfi.tfi_pc = tf->tf_rip;
 #else
-	tfi.tfi_pc = tf->tf_eip;
+			tfi.tfi_pc = tf->tf_eip;
 #endif
-	tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS;
-	tprof_sample(NULL, &tfi);
-
-	/* reset counter */
-	wrmsr(PERFCTR(ctrno), counter_reset_val);
+			tfi.tfi_counter = bit;
+			tfi.tfi_inkernel = tfi.tfi_pc >= VM_MIN_KERNEL_ADDRESS;
+			tprof_sample(NULL, &tfi);
+		} else {
+			/* not profiled, but the overflow still must be accounted for */
+			counters_offset[bit] += __BIT(COUNTER_BITWIDTH);
+		}
+	}
 
 	return 1;
 }
 
 static uint64_t
-tprof_amd_estimate_freq(void)
+tprof_amd_counter_estimate_freq(u_int counter)
 {
-	uint64_t cpufreq = curcpu()->ci_data.cpu_cc_freq;
-	uint64_t freq = 10000;
-
-	counter_val = cpufreq / freq;
-	if (counter_val == 0) {
-		counter_val = UINT64_C(4000000000) / freq;
-	}
-	return freq;
+	return curcpu()->ci_data.cpu_cc_freq;
 }
 
 static uint32_t
@@ -213,8 +251,25 @@ tprof_amd_ident(void)
 	return TPROF_IDENT_NONE;
 }
 
+static void
+tprof_amd_establish_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info * const ci = curcpu();
+
+	amd_lapic_saved[cpu_index(ci)] = lapic_readreg(LAPIC_LVT_PCINT);
+	lapic_writereg(LAPIC_LVT_PCINT, LAPIC_DLMODE_NMI);
+}
+
+static void
+tprof_amd_disestablish_cpu(void *arg1, void *arg2)
+{
+	struct cpu_info * const ci = curcpu();
+
+	lapic_writereg(LAPIC_LVT_PCINT, amd_lapic_saved[cpu_index(ci)]);
+}
+
 static int
-tprof_amd_start(const tprof_param_t *param)
+tprof_amd_establish(tprof_backend_softc_t *sc)
 {
 	uint64_t xc;
 
@@ -223,23 +278,20 @@ tprof_amd_start(const tprof_param_t *par
 	}
 
 	KASSERT(amd_nmi_handle == NULL);
-	amd_nmi_handle = nmi_establish(tprof_amd_nmi, NULL);
-
-	counter_reset_val = - counter_val + 1;
-	memcpy(&amd_param, param, sizeof(*param));
+	amd_nmi_handle = nmi_establish(tprof_amd_nmi, sc);
 
-	xc = xc_broadcast(0, tprof_amd_start_cpu, NULL, NULL);
+	xc = xc_broadcast(0, tprof_amd_establish_cpu, sc, NULL);
 	xc_wait(xc);
 
 	return 0;
 }
 
 static void
-tprof_amd_stop(const tprof_param_t *param)
+tprof_amd_disestablish(tprof_backend_softc_t *sc)
 {
 	uint64_t xc;
 
-	xc = xc_broadcast(0, tprof_amd_stop_cpu, NULL, NULL);
+	xc = xc_broadcast(0, tprof_amd_disestablish_cpu, sc, NULL);
 	xc_wait(xc);
 
 	KASSERT(amd_nmi_handle != NULL);
@@ -248,8 +300,15 @@ tprof_amd_stop(const tprof_param_t *para
 }
 
 const tprof_backend_ops_t tprof_amd_ops = {
-	.tbo_estimate_freq = tprof_amd_estimate_freq,
 	.tbo_ident = tprof_amd_ident,
+	.tbo_ncounters = tprof_amd_ncounters,
+	.tbo_counter_bitwidth = tprof_amd_counter_bitwidth,
+	.tbo_counter_read = tprof_amd_counter_read,
+	.tbo_counter_estimate_freq = tprof_amd_counter_estimate_freq,
+	.tbo_valid_event = NULL,
+	.tbo_configure_event = tprof_amd_configure_event,
 	.tbo_start = tprof_amd_start,
 	.tbo_stop = tprof_amd_stop,
+	.tbo_establish = tprof_amd_establish,
+	.tbo_disestablish = tprof_amd_disestablish,
 };

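A note on the arithmetic in tprof_amd_nmi() above: the AMD counters are
only 48 bits wide, so each overflow of a non-profiled counter adds the
full 2^48 wrap to a per-CPU software offset.  The sketch below (an
assumption, not code from this commit) shows how a monotonic 64-bit
value can then be rebuilt from that offset plus the live hardware
count; profiled counters use the reset-value bookkeeping seen in the
handler instead.

#include <stdint.h>

#define COUNTER_BITWIDTH	48

static uint64_t
counter64(uint64_t sw_offset, uint64_t hw_count)
{
	/* the hardware implements only the low 48 bits */
	hw_count &= (UINT64_C(1) << COUNTER_BITWIDTH) - 1;
	return sw_offset + hw_count;
}
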
Index: src/sys/dev/tprof/tprof_x86.c
diff -u src/sys/dev/tprof/tprof_x86.c:1.1 src/sys/dev/tprof/tprof_x86.c:1.2
--- src/sys/dev/tprof/tprof_x86.c:1.1	Tue Jul 24 09:47:35 2018
+++ src/sys/dev/tprof/tprof_x86.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_x86.c,v 1.1 2018/07/24 09:47:35 maxv Exp $	*/
+/*	$NetBSD: tprof_x86.c,v 1.2 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tprof_x86.c,v 1.1 2018/07/24 09:47:35 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tprof_x86.c,v 1.2 2022/12/01 00:32:52 ryo Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -54,16 +54,28 @@ extern const tprof_backend_ops_t tprof_i
 static int
 tprof_x86_init(void)
 {
+	const tprof_backend_ops_t *ops;
+	const char *name;
+	int ncounters;
+
 	switch (cpu_vendor) {
 	case CPUVENDOR_AMD:
-		return tprof_backend_register("tprof_amd", &tprof_amd_ops,
-		    TPROF_BACKEND_VERSION);
+		name = "tprof_amd";
+		ops = &tprof_amd_ops;
+		break;
 	case CPUVENDOR_INTEL:
-		return tprof_backend_register("tprof_intel", &tprof_intel_ops,
-		    TPROF_BACKEND_VERSION);
+		name = "tprof_intel";
+		ops = &tprof_intel_ops;
+		break;
 	default:
 		return ENOTSUP;
 	}
+
+	ncounters = ops->tbo_ncounters();
+	if (ncounters == 0)
+		return ENOTSUP;
+
+	return tprof_backend_register(name, ops, TPROF_BACKEND_VERSION);
 }
 
 static int

Index: src/usr.sbin/tprof/tprof.8
diff -u src/usr.sbin/tprof/tprof.8:1.16 src/usr.sbin/tprof/tprof.8:1.17
--- src/usr.sbin/tprof/tprof.8:1.16	Wed May 25 06:17:19 2022
+++ src/usr.sbin/tprof/tprof.8	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-.\"	$NetBSD: tprof.8,v 1.16 2022/05/25 06:17:19 msaitoh Exp $
+.\"	$NetBSD: tprof.8,v 1.17 2022/12/01 00:32:52 ryo Exp $
 .\"
 .\" Copyright (c)2011 YAMAMOTO Takashi,
 .\" All rights reserved.
@@ -24,7 +24,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd October 11, 2019
+.Dd December 1, 2022
 .Dt TPROF 8
 .Os
 .Sh NAME
@@ -67,6 +67,7 @@ Display a list of performance counter ev
 .It monitor Xo
 .Fl e
 .Ar name:option
+.Op Fl e Ar ...
 .Op Fl o Ar outfile
 .Ar command
 .Xc

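With the repeated -e option documented above, a single invocation can
now monitor several events at once, for example (the event names are
illustrative and CPU-dependent):

	tprof monitor -e llc-misses:k -e branch-misses:u -o tprof.out make
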
Index: src/usr.sbin/tprof/tprof.c
diff -u src/usr.sbin/tprof/tprof.c:1.13 src/usr.sbin/tprof/tprof.c:1.14
--- src/usr.sbin/tprof/tprof.c:1.13	Tue Jul 24 09:50:37 2018
+++ src/usr.sbin/tprof/tprof.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof.c,v 1.13 2018/07/24 09:50:37 maxv Exp $	*/
+/*	$NetBSD: tprof.c,v 1.14 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*
  * Copyright (c) 2018 The NetBSD Foundation, Inc.
@@ -57,7 +57,7 @@
 
 #include <sys/cdefs.h>
 #ifndef lint
-__RCSID("$NetBSD: tprof.c,v 1.13 2018/07/24 09:50:37 maxv Exp $");
+__RCSID("$NetBSD: tprof.c,v 1.14 2022/12/01 00:32:52 ryo Exp $");
 #endif /* not lint */
 
 #include <sys/ioctl.h>
@@ -80,8 +80,11 @@ __RCSID("$NetBSD: tprof.c,v 1.13 2018/07
 
 #define	_PATH_TPROF	"/dev/tprof"
 
+struct tprof_info tprof_info;
+u_int ncounters;
 int devfd;
 int outfd;
+u_int nevent;
 
 static void tprof_list(int, char **);
 static void tprof_monitor(int, char **) __dead;
@@ -106,7 +109,7 @@ usage(void)
 	fprintf(stderr, "\n");
 	fprintf(stderr, "\tlist\n");
 	fprintf(stderr, "\t\tList the available events.\n");
-	fprintf(stderr, "\tmonitor -e name:option [-o outfile] command\n");
+	fprintf(stderr, "\tmonitor -e name:option [-e ...] [-o outfile] command\n");
 	fprintf(stderr, "\t\tMonitor the event 'name' with option 'option'\n"
 	    "\t\tcounted during the execution of 'command'.\n");
 	fprintf(stderr, "\tanalyze [-CkLPs] [-p pid] file\n");
@@ -156,14 +159,15 @@ static void
 tprof_monitor(int argc, char **argv)
 {
 	const char *outfile = "tprof.out";
-	struct tprof_param param;
 	struct tprof_stat ts;
+	tprof_param_t params[TPROF_MAXCOUNTERS];
 	pid_t pid;
 	pthread_t pt;
-	int ret, ch;
+	int ret, ch, i;
 	char *tokens[2];
+	tprof_countermask_t mask = TPROF_COUNTERMASK_ALL;
 
-	memset(&param, 0, sizeof(param));
+	memset(params, 0, sizeof(params));
 
 	while ((ch = getopt(argc, argv, "o:e:")) != -1) {
 		switch (ch) {
@@ -175,11 +179,17 @@ tprof_monitor(int argc, char **argv)
 			tokens[1] = strtok(NULL, ":");
 			if (tokens[1] == NULL)
 				usage();
-			tprof_event_lookup(tokens[0], &param);
+			if (nevent >= __arraycount(params) ||
+			    nevent >= ncounters)
+				errx(EXIT_FAILURE, "Too many events");
+			tprof_event_lookup(tokens[0], &params[nevent]);
 			if (strchr(tokens[1], 'u'))
-				param.p_flags |= TPROF_PARAM_USER;
+				params[nevent].p_flags |= TPROF_PARAM_USER;
 			if (strchr(tokens[1], 'k'))
-				param.p_flags |= TPROF_PARAM_KERN;
+				params[nevent].p_flags |= TPROF_PARAM_KERN;
+			if (params[nevent].p_flags == 0)
+				usage();
+			nevent++;
 			break;
 		default:
 			usage();
@@ -187,11 +197,7 @@ tprof_monitor(int argc, char **argv)
 	}
 	argc -= optind;
 	argv += optind;
-	if (argc == 0) {
-		usage();
-	}
-
-	if (param.p_flags == 0) {
+	if (argc == 0 || nevent == 0) {
 		usage();
 	}
 
@@ -200,7 +206,15 @@ tprof_monitor(int argc, char **argv)
 		err(EXIT_FAILURE, "%s", outfile);
 	}
 
-	ret = ioctl(devfd, TPROF_IOC_START, &param);
+	for (i = 0; i < (int)nevent; i++) {
+		params[i].p_counter = i;
+		params[i].p_flags |= TPROF_PARAM_PROFILE;
+		ret = ioctl(devfd, TPROF_IOC_CONFIGURE_EVENT, &params[i]);
+		if (ret == -1)
+			err(EXIT_FAILURE, "TPROF_IOC_CONFIGURE_EVENT");
+	}
+
+	ret = ioctl(devfd, TPROF_IOC_START, &mask);
 	if (ret == -1) {
 		err(EXIT_FAILURE, "TPROF_IOC_START");
 	}
@@ -237,7 +251,7 @@ tprof_monitor(int argc, char **argv)
 		}
 	}
 
-	ret = ioctl(devfd, TPROF_IOC_STOP, NULL);
+	ret = ioctl(devfd, TPROF_IOC_STOP, &mask);
 	if (ret == -1) {
 		err(EXIT_FAILURE, "TPROF_IOC_STOP");
 	}
@@ -263,7 +277,6 @@ tprof_monitor(int argc, char **argv)
 int
 main(int argc, char *argv[])
 {
-	struct tprof_info info;
 	const struct cmdtab *ct;
 	int ret;
 
@@ -275,18 +288,26 @@ main(int argc, char *argv[])
 		err(EXIT_FAILURE, "%s", _PATH_TPROF);
 	}
 
-	ret = ioctl(devfd, TPROF_IOC_GETINFO, &info);
+	ret = ioctl(devfd, TPROF_IOC_GETINFO, &tprof_info);
 	if (ret == -1) {
 		err(EXIT_FAILURE, "TPROF_IOC_GETINFO");
 	}
-	if (info.ti_version != TPROF_VERSION) {
+	if (tprof_info.ti_version != TPROF_VERSION) {
 		errx(EXIT_FAILURE, "version mismatch: version=%d, expected=%d",
-		    info.ti_version, TPROF_VERSION);
+		    tprof_info.ti_version, TPROF_VERSION);
 	}
-	if (tprof_event_init(info.ti_ident) == -1) {
+	if (tprof_event_init(tprof_info.ti_ident) == -1) {
 		errx(EXIT_FAILURE, "cpu not supported");
 	}
 
+	ret = ioctl(devfd, TPROF_IOC_GETNCOUNTERS, &ncounters);
+	if (ret == -1) {
+		err(EXIT_FAILURE, "TPROF_IOC_GETNCOUNTERS");
+	}
+	if (ncounters == 0) {
+		errx(EXIT_FAILURE, "no available counters");
+	}
+
 	if (argc == 0)
 		usage();
 

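Condensing the hunks above: the monitor path now configures each event
on its own counter and then starts and stops them all with a single
countermask, instead of passing one tprof_param_t to TPROF_IOC_START.
A stripped-down view of the new sequence, reusing the variables from
the diff (error handling omitted; reading TPROF_PARAM_PROFILE as
"generate profiling samples for this counter" is an inference from its
use here):

	for (i = 0; i < (int)nevent; i++) {
		params[i].p_counter = i;	/* bind event i to counter i */
		params[i].p_flags |= TPROF_PARAM_PROFILE;
		ioctl(devfd, TPROF_IOC_CONFIGURE_EVENT, &params[i]);
	}
	ioctl(devfd, TPROF_IOC_START, &mask);	/* mask = TPROF_COUNTERMASK_ALL */
	/* ... run the command, drain samples ... */
	ioctl(devfd, TPROF_IOC_STOP, &mask);
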
Index: src/usr.sbin/tprof/tprof_analyze.c
diff -u src/usr.sbin/tprof/tprof_analyze.c:1.5 src/usr.sbin/tprof/tprof_analyze.c:1.6
--- src/usr.sbin/tprof/tprof_analyze.c:1.5	Thu Oct 14 09:52:40 2021
+++ src/usr.sbin/tprof/tprof_analyze.c	Thu Dec  1 00:32:52 2022
@@ -1,4 +1,4 @@
-/*	$NetBSD: tprof_analyze.c,v 1.5 2021/10/14 09:52:40 skrll Exp $	*/
+/*	$NetBSD: tprof_analyze.c,v 1.6 2022/12/01 00:32:52 ryo Exp $	*/
 
 /*
  * Copyright (c) 2010,2011,2012 YAMAMOTO Takashi,
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 #ifndef lint
-__RCSID("$NetBSD: tprof_analyze.c,v 1.5 2021/10/14 09:52:40 skrll Exp $");
+__RCSID("$NetBSD: tprof_analyze.c,v 1.6 2022/12/01 00:32:52 ryo Exp $");
 #endif /* not lint */
 
 #include <assert.h>
@@ -63,6 +63,7 @@ struct addr {
 	uint32_t cpuid;		/* cpu id */
 	bool in_kernel;		/* if addr is in the kernel address space */
 	unsigned int nsamples;	/* number of samples taken for the address */
+	unsigned int ncount[TPROF_MAXCOUNTERS];	/* count per event */
 };
 
 static rb_tree_t addrtree;
@@ -278,6 +279,7 @@ tprof_analyze(int argc, char **argv)
 	size_t naddrs, nsamples, i;
 	float perc;
 	int ch;
+	u_int c, maxevent = 0;
 	bool distinguish_processes = true;
 	bool distinguish_cpus = true;
 	bool distinguish_lwps = true;
@@ -363,6 +365,7 @@ tprof_analyze(int argc, char **argv)
 			continue;
 		}
 		a = emalloc(sizeof(*a));
+		memset(a, 0, sizeof(*a));
 		a->addr = (uint64_t)sample.s_pc;
 		if (distinguish_processes) {
 			a->pid = sample.s_pid;
@@ -389,7 +392,13 @@ tprof_analyze(int argc, char **argv)
 				a->addr -= offset;
 			}
 		}
+		c = __SHIFTOUT(sample.s_flags, TPROF_SAMPLE_COUNTER_MASK);
+		assert(c < TPROF_MAXCOUNTERS);
+		if (maxevent < c)
+			maxevent = c;
+
 		a->nsamples = 1;
+		a->ncount[c] = 1;
 		o = rb_tree_insert_node(&addrtree, a);
 		if (o != a) {
 			assert(a->addr == o->addr);
@@ -398,7 +407,9 @@ tprof_analyze(int argc, char **argv)
 			assert(a->cpuid == o->cpuid);
 			assert(a->in_kernel == o->in_kernel);
 			free(a);
+
 			o->nsamples++;
+			o->ncount[c]++;
 		} else {
 			naddrs++;
 		}
@@ -423,8 +434,17 @@ tprof_analyze(int argc, char **argv)
 	 */
 	printf("File: %s\n", argv[0]);
 	printf("Number of samples: %zu\n\n", nsamples);
-	printf("percentage   nsamples pid    lwp    cpu  k address          symbol\n");
-	printf("------------ -------- ------ ------ ---- - ---------------- ------\n");
+
+	printf("percentage   nsamples ");
+	for (c = 0; c <= maxevent; c++)
+		printf("event#%02u ", c);
+	printf("pid    lwp    cpu  k address          symbol\n");
+
+	printf("------------ -------- ");
+	for (c = 0; c <= maxevent; c++)
+		printf("-------- ");
+
+	printf("------ ------ ---- - ---------------- ------\n");
 	for (i = 0; i < naddrs; i++) {
 		const char *name;
 		char buf[100];
@@ -448,11 +468,17 @@ tprof_analyze(int argc, char **argv)
 
 		perc = ((float)a->nsamples / (float)nsamples) * 100.0;
 
-		printf("%11f%% %8u %6" PRIu32 " %6" PRIu32 " %4" PRIu32 " %u %016"
-		    PRIx64 " %s\n",
-		    perc,
-		    a->nsamples, a->pid, a->lwpid, a->cpuid, a->in_kernel,
-		    a->addr, name);
+		printf("%11f%% %8u", perc, a->nsamples);
+
+		for (c = 0; c <= maxevent; c++)
+			printf(" %8u", a->ncount[c]);
+
+		printf(" %6" PRIu32 " %6" PRIu32 " %4" PRIu32 " %u %016"
+		    PRIx64" %s",
+		    a->pid, a->lwpid, a->cpuid, a->in_kernel, a->addr, name);
+
+
+		printf("\n");
 	}
 
 	fclose(f);

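For reference, with two events recorded (maxevent == 1) the header
printed by the code above becomes:

	percentage   nsamples event#00 event#01 pid    lwp    cpu  k address          symbol
	------------ -------- -------- -------- ------ ------ ---- - ---------------- ------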