Module Name:    src
Committed By:   thorpej
Date:           Sat Aug 29 20:07:00 UTC 2020

Modified Files:
        src/sys/arch/alpha/alpha: cpu.c ipifuncs.c pmap.c vm_machdep.c
        src/sys/arch/alpha/include: cpu.h intr.h pmap.h

Log Message:
- Centralize per-CPU pmap initialization into a new pmap_init_cpu()
  function.  Call it from pmap_bootstrap() for the boot CPU, and
  from cpu_hatch() for secondary CPUs.
- Eliminate the dedicated I-stream memory barrier IPI; handle it all from
  the TLB shootdown IPI.  Const poison, and add some additional memory
  barriers and a TBIA to the PAUSE IPI.
- Completely rewrite TLB management in the alpha pmap module, borrowing
  some ideas from the x86 pmap and adapting them to the alpha environment.
  See the comments for theory of operation.  Add a bunch of stats that
  can be reported (disabled by default).
- Add some additional symbol decorations to improve cache behavior on
  MP systems.  Ensure coherency unit alignment for several structures
  in the pmap module.  Use hashed locks for pmap structures.
- Start out all new processes on the kernel page tables until their
  first trip through pmap_activate() to avoid the potential of polluting
  the current ASN in TLB with cross-process mappings.


To generate a diff of this commit:
cvs rdiff -u -r1.98 -r1.99 src/sys/arch/alpha/alpha/cpu.c
cvs rdiff -u -r1.51 -r1.52 src/sys/arch/alpha/alpha/ipifuncs.c
cvs rdiff -u -r1.268 -r1.269 src/sys/arch/alpha/alpha/pmap.c
cvs rdiff -u -r1.115 -r1.116 src/sys/arch/alpha/alpha/vm_machdep.c
cvs rdiff -u -r1.88 -r1.89 src/sys/arch/alpha/include/cpu.h
cvs rdiff -u -r1.72 -r1.73 src/sys/arch/alpha/include/intr.h
cvs rdiff -u -r1.82 -r1.83 src/sys/arch/alpha/include/pmap.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/sys/arch/alpha/alpha/cpu.c
diff -u src/sys/arch/alpha/alpha/cpu.c:1.98 src/sys/arch/alpha/alpha/cpu.c:1.99
--- src/sys/arch/alpha/alpha/cpu.c:1.98	Fri Aug  7 14:20:08 2020
+++ src/sys/arch/alpha/alpha/cpu.c	Sat Aug 29 20:06:59 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.c,v 1.98 2020/08/07 14:20:08 fcambus Exp $ */
+/* $NetBSD: cpu.c,v 1.99 2020/08/29 20:06:59 thorpej Exp $ */
 
 /*-
  * Copyright (c) 1998, 1999, 2000, 2001 The NetBSD Foundation, Inc.
@@ -59,7 +59,7 @@
 
 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
 
-__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.98 2020/08/07 14:20:08 fcambus Exp $");
+__KERNEL_RCSID(0, "$NetBSD: cpu.c,v 1.99 2020/08/29 20:06:59 thorpej Exp $");
 
 #include "opt_ddb.h"
 #include "opt_multiprocessor.h"
@@ -549,8 +549,8 @@ cpu_hatch(struct cpu_info *ci)
 	u_long cpu_id = cpu_number();
 	u_long cpumask = (1UL << cpu_id);
 
-	/* Mark the kernel pmap active on this processor. */
-	atomic_or_ulong(&pmap_kernel()->pm_cpus, cpumask);
+	/* pmap initialization for this processor. */
+	pmap_init_cpu(ci);
 
 	/* Initialize trap vectors for this processor. */
 	trap_init();

Index: src/sys/arch/alpha/alpha/ipifuncs.c
diff -u src/sys/arch/alpha/alpha/ipifuncs.c:1.51 src/sys/arch/alpha/alpha/ipifuncs.c:1.52
--- src/sys/arch/alpha/alpha/ipifuncs.c:1.51	Sat Aug 15 16:09:07 2020
+++ src/sys/arch/alpha/alpha/ipifuncs.c	Sat Aug 29 20:06:59 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: ipifuncs.c,v 1.51 2020/08/15 16:09:07 thorpej Exp $ */
+/* $NetBSD: ipifuncs.c,v 1.52 2020/08/29 20:06:59 thorpej Exp $ */
 
 /*-
  * Copyright (c) 1998, 1999, 2000, 2001 The NetBSD Foundation, Inc.
@@ -32,7 +32,7 @@
 
 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
 
-__KERNEL_RCSID(0, "$NetBSD: ipifuncs.c,v 1.51 2020/08/15 16:09:07 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ipifuncs.c,v 1.52 2020/08/29 20:06:59 thorpej Exp $");
 
 /*
  * Interprocessor interrupt handlers.
@@ -62,21 +62,15 @@ typedef void (*ipifunc_t)(struct cpu_inf
 
 static void	alpha_ipi_halt(struct cpu_info *, struct trapframe *);
 static void	alpha_ipi_microset(struct cpu_info *, struct trapframe *);
-static void	alpha_ipi_imb(struct cpu_info *, struct trapframe *);
 static void	alpha_ipi_ast(struct cpu_info *, struct trapframe *);
 static void	alpha_ipi_pause(struct cpu_info *, struct trapframe *);
 static void	alpha_ipi_xcall(struct cpu_info *, struct trapframe *);
 static void	alpha_ipi_generic(struct cpu_info *, struct trapframe *);
 
-/*
- * NOTE: This table must be kept in order with the bit definitions
- * in <machine/intr.h>.
- */
 const ipifunc_t ipifuncs[ALPHA_NIPIS] = {
 	[ilog2(ALPHA_IPI_HALT)] =	alpha_ipi_halt,
 	[ilog2(ALPHA_IPI_MICROSET)] =	alpha_ipi_microset,
-	[ilog2(ALPHA_IPI_SHOOTDOWN)] =	pmap_do_tlb_shootdown,
-	[ilog2(ALPHA_IPI_IMB)] =	alpha_ipi_imb,
+	[ilog2(ALPHA_IPI_SHOOTDOWN)] =	pmap_tlb_shootdown_ipi,
 	[ilog2(ALPHA_IPI_AST)] =	alpha_ipi_ast,
 	[ilog2(ALPHA_IPI_PAUSE)] =	alpha_ipi_pause,
 	[ilog2(ALPHA_IPI_XCALL)] =	alpha_ipi_xcall,
@@ -87,7 +81,6 @@ const char * const ipinames[ALPHA_NIPIS]
 	[ilog2(ALPHA_IPI_HALT)] =	"halt ipi",
 	[ilog2(ALPHA_IPI_MICROSET)] =	"microset ipi",
 	[ilog2(ALPHA_IPI_SHOOTDOWN)] =	"shootdown ipi",
-	[ilog2(ALPHA_IPI_IMB)] =	"imb ipi",
 	[ilog2(ALPHA_IPI_AST)] =	"ast ipi",
 	[ilog2(ALPHA_IPI_PAUSE)] =	"pause ipi",
 	[ilog2(ALPHA_IPI_XCALL)] =	"xcall ipi",
@@ -156,7 +149,7 @@ alpha_ipi_process(struct cpu_info *ci, s
  * Send an interprocessor interrupt.
  */
 void
-alpha_send_ipi(u_long cpu_id, u_long ipimask)
+alpha_send_ipi(u_long const cpu_id, u_long const ipimask)
 {
 
 	KASSERT(cpu_id < hwrpb->rpb_pcs_cnt);
@@ -171,14 +164,13 @@ alpha_send_ipi(u_long cpu_id, u_long ipi
  * Broadcast an IPI to all but ourselves.
  */
 void
-alpha_broadcast_ipi(u_long ipimask)
+alpha_broadcast_ipi(u_long const ipimask)
 {
 	struct cpu_info *ci;
 	CPU_INFO_ITERATOR cii;
-	u_long cpu_id = cpu_number();
-	u_long cpumask;
 
-	cpumask = cpus_running & ~(1UL << cpu_id);
+	const u_long cpu_id = cpu_number();
+	const u_long cpumask = cpus_running & ~(1UL << cpu_id);
 
 	for (CPU_INFO_FOREACH(cii, ci)) {
 		if ((cpumask & (1UL << ci->ci_cpuid)) == 0)
@@ -191,7 +183,7 @@ alpha_broadcast_ipi(u_long ipimask)
  * Send an IPI to all in the list but ourselves.
  */
 void
-alpha_multicast_ipi(u_long cpumask, u_long ipimask)
+alpha_multicast_ipi(u_long cpumask, u_long const ipimask)
 {
 	struct cpu_info *ci;
 	CPU_INFO_ITERATOR cii;
@@ -209,10 +201,11 @@ alpha_multicast_ipi(u_long cpumask, u_lo
 }
 
 static void
-alpha_ipi_halt(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_halt(struct cpu_info * const ci,
+    struct trapframe * const framep __unused)
 {
-	u_long cpu_id = ci->ci_cpuid;
-	u_long wait_mask = (1UL << cpu_id);
+	const u_long cpu_id = ci->ci_cpuid;
+	const u_long wait_mask = (1UL << cpu_id);
 
 	/* Disable interrupts. */
 	(void) splhigh();
@@ -242,21 +235,16 @@ alpha_ipi_halt(struct cpu_info *ci, stru
 }
 
 static void
-alpha_ipi_microset(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_microset(struct cpu_info * const ci,
+    struct trapframe * const framep __unused)
 {
 
 	cc_calibrate_cpu(ci);
 }
 
 static void
-alpha_ipi_imb(struct cpu_info *ci, struct trapframe *framep)
-{
-
-	alpha_pal_imb();
-}
-
-static void
-alpha_ipi_ast(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_ast(struct cpu_info * const ci,
+    struct trapframe * const framep __unused)
 {
 
 	if (ci->ci_onproc != ci->ci_data.cpu_idlelwp)
@@ -264,16 +252,16 @@ alpha_ipi_ast(struct cpu_info *ci, struc
 }
 
 static void
-alpha_ipi_pause(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_pause(struct cpu_info * const ci, struct trapframe * const framep)
 {
-	u_long cpumask = (1UL << ci->ci_cpuid);
+	const u_long cpumask = (1UL << ci->ci_cpuid);
 	int s;
 
 	s = splhigh();
 
 	/* Point debuggers at our trapframe for register state. */
 	ci->ci_db_regs = framep;
-
+	alpha_wmb();
 	atomic_or_ulong(&ci->ci_flags, CPUF_PAUSED);
 
 	/* Spin with interrupts disabled until we're resumed. */
@@ -282,12 +270,13 @@ alpha_ipi_pause(struct cpu_info *ci, str
 	} while (cpus_paused & cpumask);
 
 	atomic_and_ulong(&ci->ci_flags, ~CPUF_PAUSED);
-
+	alpha_wmb();
 	ci->ci_db_regs = NULL;
 
 	splx(s);
 
-	/* Do an IMB on the way out, in case the kernel text was changed. */
+	/* Do a TBIA+IMB on the way out, in case things have changed. */
+	ALPHA_TBIA();
 	alpha_pal_imb();
 }
 
@@ -296,13 +285,14 @@ alpha_ipi_pause(struct cpu_info *ci, str
  */
 
 static void
-alpha_ipi_xcall(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_xcall(struct cpu_info * const ci __unused,
+    struct trapframe * const framep __unused)
 {
 	xc_ipi_handler();
 }
 
 void
-xc_send_ipi(struct cpu_info *ci)
+xc_send_ipi(struct cpu_info * const ci)
 {
 	KASSERT(kpreempt_disabled());
 	KASSERT(curcpu() != ci);
@@ -317,13 +307,14 @@ xc_send_ipi(struct cpu_info *ci)
 }
 
 static void
-alpha_ipi_generic(struct cpu_info *ci, struct trapframe *framep)
+alpha_ipi_generic(struct cpu_info * const ci __unused,
+    struct trapframe * const framep __unused)
 {
 	ipi_cpu_handler();
 }
 
 void
-cpu_ipi(struct cpu_info *ci)
+cpu_ipi(struct cpu_info * const ci)
 {
 	KASSERT(kpreempt_disabled());
 	KASSERT(curcpu() != ci);

Index: src/sys/arch/alpha/alpha/pmap.c
diff -u src/sys/arch/alpha/alpha/pmap.c:1.268 src/sys/arch/alpha/alpha/pmap.c:1.269
--- src/sys/arch/alpha/alpha/pmap.c:1.268	Mon Aug 17 00:57:37 2020
+++ src/sys/arch/alpha/alpha/pmap.c	Sat Aug 29 20:06:59 2020
@@ -1,12 +1,14 @@
-/* $NetBSD: pmap.c,v 1.268 2020/08/17 00:57:37 thorpej Exp $ */
+/* $NetBSD: pmap.c,v 1.269 2020/08/29 20:06:59 thorpej Exp $ */
 
 /*-
- * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008 The NetBSD Foundation, Inc.
+ * Copyright (c) 1998, 1999, 2000, 2001, 2007, 2008, 2020
+ * 	The NetBSD Foundation, Inc.
  * All rights reserved.
  *
  * This code is derived from software contributed to The NetBSD Foundation
  * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
- * NASA Ames Research Center and by Chris G. Demetriou.
+ * NASA Ames Research Center, by Andrew Doran and Mindaugas Rasiukevicius,
+ * and by Chris G. Demetriou.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -93,26 +95,19 @@
  *	The locking protocol was written by Jason R. Thorpe,
  *	using Chuck Cranor's i386 pmap for UVM as a model.
  *
- *	TLB shootdown code was written by Jason R. Thorpe.
+ *	TLB shootdown code was written (and then subsequently
+ *	rewritten some years later, borrowing some ideas from
+ *	the x86 pmap) by Jason R. Thorpe.
  *
- *	Multiprocessor modifications by Andrew Doran.
+ *	Multiprocessor modifications by Andrew Doran and
+ *	Jason R. Thorpe.
  *
  * Notes:
  *
- *	All page table access is done via K0SEG.  The one exception
- *	to this is for kernel mappings.  Since all kernel page
- *	tables are pre-allocated, we can use the Virtual Page Table
- *	to access PTEs that map K1SEG addresses.
- *
- *	Kernel page table pages are statically allocated in
- *	pmap_bootstrap(), and are never freed.  In the future,
- *	support for dynamically adding additional kernel page
- *	table pages may be added.  User page table pages are
- *	dynamically allocated and freed.
- *
- * Bugs/misfeatures:
- *
- *	- Some things could be optimized.
+ *	All user page table access is done via K0SEG.  Kernel
+ *	page table access is done via the recursive Virtual Page
+ *	Table because kernel PT pages are pre-allocated and never
+ *	freed, so no VPT fault handling is required.
  */
 
 /*
@@ -140,7 +135,7 @@
 
 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
 
-__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.268 2020/08/17 00:57:37 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.269 2020/08/29 20:06:59 thorpej Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -149,6 +144,7 @@ __KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.2
 #include <sys/malloc.h>
 #include <sys/pool.h>
 #include <sys/buf.h>
+#include <sys/evcnt.h>
 #include <sys/atomic.h>
 #include <sys/cpu.h>
 
@@ -178,12 +174,18 @@ int debugmap = 0;
 int pmapdebug = PDB_PARANOIA;
 #endif
 
+#if defined(MULTIPROCESSOR)
+#define	PMAP_MP(x)	x
+#else
+#define	PMAP_MP(x)	__nothing
+#endif /* MULTIPROCESSOR */
+
 /*
  * Given a map and a machine independent protection code,
  * convert to an alpha protection code.
  */
 #define pte_prot(m, p)	(protection_codes[m == pmap_kernel() ? 0 : 1][p])
-static int	protection_codes[2][8];
+static int	protection_codes[2][8] __read_mostly;
 
 /*
  * kernel_lev1map:
@@ -209,53 +211,63 @@ static int	protection_codes[2][8];
  * level 2 and level 3 page table pages must already be allocated
  * and mapped into the parent page table.
  */
-pt_entry_t	*kernel_lev1map;
+pt_entry_t	*kernel_lev1map __read_mostly;
 
 /*
  * Virtual Page Table.
  */
-static pt_entry_t *VPT;
+static pt_entry_t *VPT __read_mostly;
 
 static struct {
 	struct pmap k_pmap;
-	struct pmap_asn_info k_asni[ALPHA_MAXPROCS];
 } kernel_pmap_store __cacheline_aligned;
 
 struct pmap *const kernel_pmap_ptr = &kernel_pmap_store.k_pmap;
 
-paddr_t    	avail_start;	/* PA of first available physical page */
-paddr_t		avail_end;	/* PA of last available physical page */
-static vaddr_t	virtual_end;	/* VA of last avail page (end of kernel AS) */
+/* PA of first available physical page */
+paddr_t    	avail_start __read_mostly;
+
+/* PA of last available physical page */
+paddr_t		avail_end __read_mostly;
+
+/* VA of last avail page (end of kernel AS) */
+static vaddr_t	virtual_end __read_mostly;
 
-static bool pmap_initialized;	/* Has pmap_init completed? */
+/* Has pmap_init completed? */
+static bool pmap_initialized __read_mostly;
 
-u_long		pmap_pages_stolen;	/* instrumentation */
+/* Instrumentation */
+u_long		pmap_pages_stolen __read_mostly;
 
 /*
  * This variable contains the number of CPU IDs we need to allocate
  * space for when allocating the pmap structure.  It is used to
  * size a per-CPU array of ASN and ASN Generation number.
  */
-static u_long 	pmap_ncpuids;
+static u_long 	pmap_ncpuids __read_mostly;
 
 #ifndef PMAP_PV_LOWAT
 #define	PMAP_PV_LOWAT	16
 #endif
-int		pmap_pv_lowat = PMAP_PV_LOWAT;
+int		pmap_pv_lowat __read_mostly = PMAP_PV_LOWAT;
 
 /*
  * List of all pmaps, used to update them when e.g. additional kernel
  * page tables are allocated.  This list is kept LRU-ordered by
  * pmap_activate().
  */
-static TAILQ_HEAD(, pmap) pmap_all_pmaps;
+static TAILQ_HEAD(, pmap) pmap_all_pmaps __cacheline_aligned;
 
 /*
  * The pools from which pmap structures and sub-structures are allocated.
  */
-static struct pool_cache pmap_pmap_cache;
-static struct pool_cache pmap_l1pt_cache;
-static struct pool_cache pmap_pv_cache;
+static struct pool_cache pmap_pmap_cache __read_mostly;
+static struct pool_cache pmap_l1pt_cache __read_mostly;
+static struct pool_cache pmap_pv_cache __read_mostly;
+
+CTASSERT(offsetof(struct pmap, pm_asni[0]) == COHERENCY_UNIT);
+CTASSERT(PMAP_SIZEOF(ALPHA_MAXPROCS) < ALPHA_PGBYTES);
+CTASSERT(sizeof(struct pmap_asn_info) == COHERENCY_UNIT);
 
 /*
  * Address Space Numbers.
@@ -284,39 +296,22 @@ static struct pool_cache pmap_pv_cache;
  * and pmap_next_asn is changed to indicate the first non-reserved ASN.
  *
  * We reserve ASN #0 for pmaps that use the global kernel_lev1map.  This
- * prevents the following scenario:
- *
- *	* New ASN generation starts, and process A is given ASN #0.
- *
- *	* A new process B (and thus new pmap) is created.  The ASN,
- *	  for lack of a better value, is initialized to 0.
- *
- *	* Process B runs.  It is now using the TLB entries tagged
- *	  by process A.  *poof*
- *
- * In the scenario above, in addition to the processor using using incorrect
- * TLB entires, the PALcode might use incorrect information to service a
- * TLB miss.  (The PALcode uses the recursively mapped Virtual Page Table
- * to locate the PTE for a faulting address, and tagged TLB entires exist
- * for the Virtual Page Table addresses in order to speed up this procedure,
- * as well.)
- *
- * By reserving an ASN for kernel_lev1map users, we are guaranteeing that
- * new pmaps will initially run with no TLB entries for user addresses
- * or VPT mappings that map user page tables.  Since kernel_lev1map only
- * contains mappings for kernel addresses, and since those mappings
- * are always made with PG_ASM, sharing an ASN for kernel_lev1map users is
- * safe (since PG_ASM mappings match any ASN).
+ * prevents accidental accesses to user space for LWPs using the
+ * kernel pmap.  This is important because the PALcode may use the
+ * recursive VPT to service TLB misses.
+ *
+ * By reserving an ASN for the kernel, we are guaranteeing that an lwp
+ * will not see any valid user space TLB entries until it passes through
+ * pmap_activate() for the first time.
  *
  * On processors that do not support ASNs, the PALcode invalidates
- * the TLB and I-cache automatically on swpctx.  We still still go
- * through the motions of assigning an ASN (really, just refreshing
- * the ASN generation in this particular case) to keep the logic sane
- * in other parts of the code.
- */
-static u_int	pmap_max_asn;		/* max ASN supported by the system */
-					/* next ASN and cur ASN generation */
-static struct pmap_asn_info pmap_asn_info[ALPHA_MAXPROCS];
+ * non-ASM TLB entries automatically on swpctx.  We completely skip
+ * the ASN machinery in this case because the PALcode neither reads
+ * nor writes that field of the HWPCB.
+ */
+
+/* max ASN supported by the system */
+static u_int	pmap_max_asn __read_mostly;
 
 /*
  * Locking:
@@ -338,20 +333,22 @@ static struct pmap_asn_info pmap_asn_inf
  *	MUTEXES
  *	-------
  *
- *	* pm_lock (per-pmap) - This lock protects all of the members
- *	  of the pmap structure itself.  This lock will be asserted
- *	  in pmap_activate() and pmap_deactivate() from a critical
- *	  section of mi_switch(), and must never sleep.  Note that
- *	  in the case of the kernel pmap, interrupts which cause
- *	  memory allocation *must* be blocked while this lock is
- *	  asserted.
+ *	* pmap lock (global hash) - These locks protect the pmap structures.
+ *
+ *	* pmap activation lock (global hash) - These IPL_SCHED spin locks
+ *	  synchronize pmap_activate() and TLB shootdowns.  This has a lock
+ *	  ordering constraint with the tlb_lock:
  *
- *	* pvh_lock (global hash) - These locks protects the PV lists
- *	  for managed pages.
+ *		tlb_lock -> pmap activation lock
+ *
+ *	* pvh_lock (global hash) - These locks protect the PV lists for
+ *	  managed pages.
+ *
+ *	* tlb_lock - This IPL_VM lock serializes local and remote TLB
+ *	  invalidation.
  *
  *	* pmap_all_pmaps_lock - This lock protects the global list of
- *	  all pmaps.  Note that a pm_lock must never be held while this
- *	  lock is held.
+ *	  all pmaps.
  *
  *	* pmap_growkernel_lock - This lock protects pmap_growkernel()
  *	  and the virtual_end variable.
@@ -360,7 +357,7 @@ static struct pmap_asn_info pmap_asn_inf
  *	  pmap_growkernel() acquires the locks in the following order:
  *
  *		pmap_growkernel_lock (write) -> pmap_all_pmaps_lock ->
- *		    pmap->pm_lock
+ *		    pmap lock
  *
  *	  We need to ensure consistency between user pmaps and the
  *	  kernel_lev1map.  For this reason, pmap_growkernel_lock must
@@ -375,9 +372,9 @@ static struct pmap_asn_info pmap_asn_inf
  *	with the pmap already locked by the caller (which will be
  *	an interface function).
  */
-static krwlock_t pmap_main_lock;
-static kmutex_t pmap_all_pmaps_lock;
-static krwlock_t pmap_growkernel_lock;
+static krwlock_t pmap_main_lock __cacheline_aligned;
+static kmutex_t pmap_all_pmaps_lock __cacheline_aligned;
+static krwlock_t pmap_growkernel_lock __cacheline_aligned;
 
 #define	PMAP_MAP_TO_HEAD_LOCK()		rw_enter(&pmap_main_lock, RW_READER)
 #define	PMAP_MAP_TO_HEAD_UNLOCK()	rw_exit(&pmap_main_lock)
@@ -398,62 +395,690 @@ pmap_pvh_lock(struct vm_page *pg)
 	return &pmap_pvh_locks[PVH_LOCK_HASH(pg)].lock;
 }
 
+static union {
+	struct {
+		kmutex_t	lock;
+		kmutex_t	activation_lock;
+	} locks;
+	uint8_t		pad[COHERENCY_UNIT];
+} pmap_pmap_locks[64] __cacheline_aligned;
+
+#define	PMAP_LOCK_HASH(pm)						\
+	((((uintptr_t)(pm)) >> 6) & 63)
+
+static inline kmutex_t *
+pmap_pmap_lock(pmap_t const pmap)
+{
+	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.lock;
+}
+
+static inline kmutex_t *
+pmap_activation_lock(pmap_t const pmap)
+{
+	return &pmap_pmap_locks[PMAP_LOCK_HASH(pmap)].locks.activation_lock;
+}
+
+#define	PMAP_LOCK(pmap)		mutex_enter(pmap_pmap_lock(pmap))
+#define	PMAP_UNLOCK(pmap)	mutex_exit(pmap_pmap_lock(pmap))
+
+#define	PMAP_ACT_LOCK(pmap)	mutex_spin_enter(pmap_activation_lock(pmap))
+#define	PMAP_ACT_TRYLOCK(pmap)	mutex_tryenter(pmap_activation_lock(pmap))
+#define	PMAP_ACT_UNLOCK(pmap)	mutex_spin_exit(pmap_activation_lock(pmap))
+
 #if defined(MULTIPROCESSOR)
+#define	pmap_all_cpus()		cpus_running
+#else
+#define	pmap_all_cpus()		~0UL
+#endif /* MULTIPROCESSOR */
+
 /*
- * TLB Shootdown:
+ * TLB management.
  *
- * When a mapping is changed in a pmap, the TLB entry corresponding to
- * the virtual address must be invalidated on all processors.  In order
- * to accomplish this on systems with multiple processors, messages are
- * sent from the processor which performs the mapping change to all
- * processors on which the pmap is active.  For other processors, the
- * ASN generation numbers for that processor is invalidated, so that
- * the next time the pmap is activated on that processor, a new ASN
- * will be allocated (which implicitly invalidates all TLB entries).
- *
- * Note, we can use the pool allocator to allocate job entries
- * since pool pages are mapped with K0SEG, not with the TLB.
- */
-struct pmap_tlb_shootdown_job {
-	TAILQ_ENTRY(pmap_tlb_shootdown_job) pj_list;
-	vaddr_t pj_va;			/* virtual address */
-	pmap_t pj_pmap;			/* the pmap which maps the address */
-	pt_entry_t pj_pte;		/* the PTE bits */
+ * TLB invalidations need to be performed on local and remote CPUs
+ * whenever parts of the PTE that the hardware or PALcode understands
+ * changes.  In order to amortize the cost of these operations, we will
+ * queue up to 8 addresses to invalidate in a batch.  Any more than
+ * that, and we will hit the entire TLB.
+ *
+ * Some things that add complexity:
+ *
+ * ==> ASNs. A CPU may have valid TLB entries for other than the current
+ *     address space.  We can only invalidate TLB entries for the current
+ *     address space, so when asked to invalidate a VA for the non-current
+ *     pmap on a given CPU, we simply invalidate the ASN for that pmap/CPU
+ *     tuple so that a new one is allocated on the next activation on that
+ *     CPU.  N.B. that for CPUs that don't implement ASNs, SWPCTX does all
+ *     the work necessary, so we can skip some work in the pmap module
+ *     itself.
+ *
+ *     When a pmap is activated on a given CPU, we set a corresponding
+ *     bit in pmap::pm_cpus, indicating that it potentially has valid
+ *     TLB entries for that address space.  This bitmap is then used to
+ *     determine which remote CPUs need to be notified of invalidations.
+ *     The bit is cleared when the ASN is invalidated on that CPU.
+ *
+ *     In order to serialize with activating an address space on a
+ *     given CPU (that we can reliably send notifications only to
+ *     relevant remote CPUs), we acquire the pmap lock in pmap_activate()
+ *     and also hold the lock while remote shootdowns take place.
+ *     This does not apply to the kernel pmap; all CPUs are notified about
+ *     invalidations for the kernel pmap, and the pmap lock is not held
+ *     in pmap_activate() for the kernel pmap.
+ *
+ * ==> P->V operations (e.g. pmap_page_protect()) may require sending
+ *     invalidations for multiple address spaces.  We only track one
+ *     address space at a time, and if we encounter more than one, then
+ *     the notification each CPU gets is to hit the entire TLB.  Note
+ *     also that we can't serialize with pmap_activate() in this case,
+ *     so all CPUs will get the notification, and they check when
+ *     processing the notification if the pmap is current on that CPU.
+ *
+ * Invalidation information is gathered into a pmap_tlb_context structure
+ * that includes room for 8 VAs, the pmap the VAs belong to, a bitmap of
+ * CPUs to be notified, and a list for PT pages that are freed during
+ * removal of mappings.  The number of valid addresses in the list as
+ * well as flags are squeezed into the lower bits of the first two VAs.
+ * Storage for this structure is allocated on the stack.  We need to be
+ * careful to keep the size of this structure under control.
+ *
+ * When notifying remote CPUs, we acquire the tlb_lock (which also
+ * blocks IPIs), record the pointer to our context structure, set a
+ * global bitmap of CPUs to be notified, and then send the IPIs to
+ * each victim.  While the other CPUs are in-flight, we then perform
+ * any invalidations necessary on the local CPU.  Once that is done,
+ * we then wait for the global context pointer to be cleared, which
+ * will be done by the final remote CPU to complete their work. This
+ * method reduces cache line contention during processing.
+ *
+ * When removing mappings in user pmaps, this implementation frees page
+ * table pages back to the VM system once they contain no valid mappings.
+ * As we do this, we must ensure to invalidate TLB entries that the
+ * CPU might hold for the respective recursive VPT mappings.  This must
+ * be done whenever an L1 or L2 PTE is invalidated.  Until these VPT
+ * translations are invalidated, the PT pages must not be reused.  For
+ * this reason, we keep a list of freed PT pages in the context structure
+ * and drain them off once all invalidations are complete.
+ *
+ * NOTE: The value of TLB_CTX_MAXVA is tuned to accommodate the UBC
+ * window size (defined as 64KB on alpha in <machine/vmparam.h>).
+ */
+
+#define	TLB_CTX_MAXVA		8
+#define	TLB_CTX_ALLVA		PAGE_MASK
+
+#define	TLB_CTX_F_ASM		__BIT(0)
+#define	TLB_CTX_F_IMB		__BIT(1)
+#define	TLB_CTX_F_KIMB		__BIT(2)
+#define	TLB_CTX_F_PV		__BIT(3)
+#define	TLB_CTX_F_MULTI		__BIT(4)
+
+#define	TLB_CTX_COUNT(ctx)	((ctx)->t_addrdata[0] & PAGE_MASK)
+#define	TLB_CTX_INC_COUNT(ctx)	 (ctx)->t_addrdata[0]++
+#define	TLB_CTX_SET_ALLVA(ctx)	 (ctx)->t_addrdata[0] |= TLB_CTX_ALLVA
+
+#define	TLB_CTX_FLAGS(ctx)	((ctx)->t_addrdata[1] & PAGE_MASK)
+#define	TLB_CTX_SET_FLAG(ctx, f) (ctx)->t_addrdata[1] |= (f)
+
+#define	TLB_CTX_VA(ctx, i)	((ctx)->t_addrdata[(i)] & ~PAGE_MASK)
+#define	TLB_CTX_SETVA(ctx, i, va)					\
+	(ctx)->t_addrdata[(i)] = (va) | ((ctx)->t_addrdata[(i)] & PAGE_MASK)
+
+struct pmap_tlb_context {
+	uintptr_t	t_addrdata[TLB_CTX_MAXVA];
+	pmap_t		t_pmap;
+	LIST_HEAD(, vm_page) t_freeptq;
 };
 
-static struct pmap_tlb_shootdown_q {
-	TAILQ_HEAD(, pmap_tlb_shootdown_job) pq_head;	/* queue 16b */
-	kmutex_t pq_lock;		/* spin lock on queue 16b */
-	int pq_pte;			/* aggregate PTE bits 4b */
-	int pq_count;			/* number of pending requests 4b */
-	int pq_tbia;			/* pending global flush 4b */
-	uint8_t pq_pad[64-16-16-4-4-4];	/* pad to 64 bytes */
-} pmap_tlb_shootdown_q[ALPHA_MAXPROCS] __aligned(CACHE_LINE_SIZE);
+static struct {
+	kmutex_t	lock;
+	struct evcnt	events;
+} tlb_shootdown __cacheline_aligned;
+#define	tlb_lock	tlb_shootdown.lock
+#define	tlb_evcnt	tlb_shootdown.events
+#if defined(MULTIPROCESSOR)
+static const struct pmap_tlb_context *tlb_context __cacheline_aligned;
+static unsigned long tlb_pending __cacheline_aligned;
+#endif /* MULTIPROCESSOR */
+
+#if defined(TLB_STATS)
+#define	TLB_COUNT_DECL(cnt)	static struct evcnt tlb_stat_##cnt
+#define	TLB_COUNT(cnt)		atomic_inc_64(&tlb_stat_##cnt .ev_count)
+#define	TLB_COUNT_ATTACH(cnt)						\
+	evcnt_attach_dynamic_nozero(&tlb_stat_##cnt, EVCNT_TYPE_MISC,	\
+	    NULL, "TLB", #cnt)
+
+TLB_COUNT_DECL(invalidate_multi_tbia);
+TLB_COUNT_DECL(invalidate_multi_tbiap);
+TLB_COUNT_DECL(invalidate_multi_imb);
+
+TLB_COUNT_DECL(invalidate_kern_tbia);
+TLB_COUNT_DECL(invalidate_kern_tbis);
+TLB_COUNT_DECL(invalidate_kern_imb);
+
+TLB_COUNT_DECL(invalidate_user_not_current);
+TLB_COUNT_DECL(invalidate_user_lazy_imb);
+TLB_COUNT_DECL(invalidate_user_tbiap);
+TLB_COUNT_DECL(invalidate_user_tbis);
+
+TLB_COUNT_DECL(shootdown_kernel);
+TLB_COUNT_DECL(shootdown_user);
+TLB_COUNT_DECL(shootdown_imb);
+TLB_COUNT_DECL(shootdown_kimb);
+TLB_COUNT_DECL(shootdown_overflow);
+
+TLB_COUNT_DECL(shootdown_all_user);
+TLB_COUNT_DECL(shootdown_all_user_imb);
+
+TLB_COUNT_DECL(shootdown_pv);
+TLB_COUNT_DECL(shootdown_pv_multi);
+
+TLB_COUNT_DECL(shootnow_over_notify);
+TLB_COUNT_DECL(shootnow_remote);
+
+TLB_COUNT_DECL(reason_remove_kernel);
+TLB_COUNT_DECL(reason_remove_user);
+TLB_COUNT_DECL(reason_page_protect_read);
+TLB_COUNT_DECL(reason_page_protect_none);
+TLB_COUNT_DECL(reason_protect);
+TLB_COUNT_DECL(reason_enter_kernel);
+TLB_COUNT_DECL(reason_enter_user);
+TLB_COUNT_DECL(reason_kenter);
+TLB_COUNT_DECL(reason_enter_l2pt_delref);
+TLB_COUNT_DECL(reason_enter_l3pt_delref);
+TLB_COUNT_DECL(reason_kremove);
+TLB_COUNT_DECL(reason_clear_modify);
+TLB_COUNT_DECL(reason_clear_reference);
+TLB_COUNT_DECL(reason_emulate_reference);
+
+TLB_COUNT_DECL(asn_reuse);
+TLB_COUNT_DECL(asn_newgen);
+TLB_COUNT_DECL(asn_assign);
+
+TLB_COUNT_DECL(activate_both_change);
+TLB_COUNT_DECL(activate_asn_change);
+TLB_COUNT_DECL(activate_ptbr_change);
+TLB_COUNT_DECL(activate_swpctx);
+TLB_COUNT_DECL(activate_skip_swpctx);
+
+#else /* ! TLB_STATS */
+#define	TLB_COUNT(cnt)		__nothing
+#define	TLB_COUNT_ATTACH(cnt)	__nothing
+#endif /* TLB_STATS */
+
+static void
+pmap_tlb_init(void)
+{
+	/* mutex is initialized in pmap_bootstrap(). */
+
+	evcnt_attach_dynamic_nozero(&tlb_evcnt, EVCNT_TYPE_MISC,
+	    NULL, "TLB", "shootdown");
+
+	TLB_COUNT_ATTACH(invalidate_multi_tbia);
+	TLB_COUNT_ATTACH(invalidate_multi_tbiap);
+	TLB_COUNT_ATTACH(invalidate_multi_imb);
+
+	TLB_COUNT_ATTACH(invalidate_kern_tbia);
+	TLB_COUNT_ATTACH(invalidate_kern_tbis);
+	TLB_COUNT_ATTACH(invalidate_kern_imb);
+
+	TLB_COUNT_ATTACH(invalidate_user_not_current);
+	TLB_COUNT_ATTACH(invalidate_user_lazy_imb);
+	TLB_COUNT_ATTACH(invalidate_user_tbiap);
+	TLB_COUNT_ATTACH(invalidate_user_tbis);
+
+	TLB_COUNT_ATTACH(shootdown_kernel);
+	TLB_COUNT_ATTACH(shootdown_user);
+	TLB_COUNT_ATTACH(shootdown_imb);
+	TLB_COUNT_ATTACH(shootdown_kimb);
+	TLB_COUNT_ATTACH(shootdown_overflow);
+
+	TLB_COUNT_ATTACH(shootdown_all_user);
+	TLB_COUNT_ATTACH(shootdown_all_user_imb);
+
+	TLB_COUNT_ATTACH(shootdown_pv);
+	TLB_COUNT_ATTACH(shootdown_pv_multi);
+
+	TLB_COUNT_ATTACH(shootnow_over_notify);
+	TLB_COUNT_ATTACH(shootnow_remote);
+
+	TLB_COUNT_ATTACH(reason_remove_kernel);
+	TLB_COUNT_ATTACH(reason_remove_user);
+	TLB_COUNT_ATTACH(reason_page_protect_read);
+	TLB_COUNT_ATTACH(reason_page_protect_none);
+	TLB_COUNT_ATTACH(reason_protect);
+	TLB_COUNT_ATTACH(reason_enter_kernel);
+	TLB_COUNT_ATTACH(reason_enter_user);
+	TLB_COUNT_ATTACH(reason_kenter);
+	TLB_COUNT_ATTACH(reason_enter_l2pt_delref);
+	TLB_COUNT_ATTACH(reason_enter_l3pt_delref);
+	TLB_COUNT_ATTACH(reason_kremove);
+	TLB_COUNT_ATTACH(reason_clear_modify);
+	TLB_COUNT_ATTACH(reason_clear_reference);
+
+	TLB_COUNT_ATTACH(asn_reuse);
+	TLB_COUNT_ATTACH(asn_newgen);
+	TLB_COUNT_ATTACH(asn_assign);
+
+	TLB_COUNT_ATTACH(activate_both_change);
+	TLB_COUNT_ATTACH(activate_asn_change);
+	TLB_COUNT_ATTACH(activate_ptbr_change);
+	TLB_COUNT_ATTACH(activate_swpctx);
+	TLB_COUNT_ATTACH(activate_skip_swpctx);
+}
+
+static inline void
+pmap_tlb_context_init(struct pmap_tlb_context * const tlbctx)
+{
+	/* Initialize the minimum number of fields. */
+	tlbctx->t_addrdata[0] = 0;
+	tlbctx->t_addrdata[1] = 0;
+	tlbctx->t_pmap = NULL;
+	LIST_INIT(&tlbctx->t_freeptq);
+}
+
+static void
+pmap_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
+    pt_entry_t const pte_bits, struct pmap_tlb_context * const tlbctx)
+{
+	KASSERT(pmap != NULL);
+	KASSERT((va & PAGE_MASK) == 0);
+
+	/*
+	 * Figure out who needs to hear about this, and the scope
+	 * of an all-entries invalidate.
+	 */
+	if (pmap == pmap_kernel()) {
+		TLB_COUNT(shootdown_kernel);
+		KASSERT(pte_bits & PG_ASM);
+		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_ASM);
+
+		/* Note if an I-stream sync is also needed. */
+		if (pte_bits & PG_EXEC) {
+			TLB_COUNT(shootdown_kimb);
+			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_KIMB);
+		}
+	} else {
+		TLB_COUNT(shootdown_user);
+		KASSERT((pte_bits & PG_ASM) == 0);
+
+		/* Note if an I-stream sync is also needed. */
+		if (pte_bits & PG_EXEC) {
+			TLB_COUNT(shootdown_imb);
+			TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
+		}
+	}
+
+	KASSERT(tlbctx->t_pmap == NULL || tlbctx->t_pmap == pmap);
+	tlbctx->t_pmap = pmap;
+
+	/*
+	 * If we're already at the max, just tell each active CPU
+	 * to nail everything.
+	 */
+	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
+	if (count > TLB_CTX_MAXVA) {
+		return;
+	}
+	if (count == TLB_CTX_MAXVA) {
+		TLB_COUNT(shootdown_overflow);
+		TLB_CTX_SET_ALLVA(tlbctx);
+		return;
+	}
+
+	TLB_CTX_SETVA(tlbctx, count, va);
+	TLB_CTX_INC_COUNT(tlbctx);
+}
+
+static void
+pmap_tlb_shootdown_all_user(pmap_t const pmap, pt_entry_t const pte_bits,
+    struct pmap_tlb_context * const tlbctx)
+{
+	KASSERT(pmap != pmap_kernel());
+
+	TLB_COUNT(shootdown_all_user);
+
+	/* Note if an I-stream sync is also needed. */
+	if (pte_bits & PG_EXEC) {
+		TLB_COUNT(shootdown_all_user_imb);
+		TLB_CTX_SET_FLAG(tlbctx, TLB_CTX_F_IMB);
+	}
+
+	TLB_CTX_SET_ALLVA(tlbctx);
+}
+
+static void
+pmap_tlb_shootdown_pv(const pv_entry_t pv, pt_entry_t const pte_bits,
+    struct pmap_tlb_context * const tlbctx)
+{
+	uintptr_t flags = TLB_CTX_F_PV;
+
+	TLB_COUNT(shootdown_pv);
+
+	if (tlbctx->t_pmap == NULL || tlbctx->t_pmap == pv->pv_pmap) {
+		if (tlbctx->t_pmap == NULL) {
+			pmap_reference(pv->pv_pmap);
+		}
+		pmap_tlb_shootdown(pv->pv_pmap, pv->pv_va, pte_bits, tlbctx);
+	} else {
+		TLB_COUNT(shootdown_pv_multi);
+		flags |= TLB_CTX_F_MULTI;
+		if (pv->pv_pmap == pmap_kernel()) {
+			KASSERT(pte_bits & PG_ASM);
+			flags |= TLB_CTX_F_ASM;
+		} else {
+			KASSERT((pte_bits & PG_ASM) == 0);
+		}
+
+		/*
+		 * No need to distinguish between kernel and user IMB
+		 * here; see pmap_tlb_invalidate_multi().
+		 */
+		if (pte_bits & PG_EXEC) {
+			flags |= TLB_CTX_F_IMB;
+		}
+		TLB_CTX_SET_ALLVA(tlbctx);
+	}
+	TLB_CTX_SET_FLAG(tlbctx, flags);
+}
+
+static void
+pmap_tlb_invalidate_multi(const struct pmap_tlb_context * const tlbctx)
+{
+	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
+		TLB_COUNT(invalidate_multi_tbia);
+		ALPHA_TBIA();
+	} else {
+		TLB_COUNT(invalidate_multi_tbiap);
+		ALPHA_TBIAP();
+	}
+	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_IMB | TLB_CTX_F_KIMB)) {
+		TLB_COUNT(invalidate_multi_imb);
+		alpha_pal_imb();
+	}
+}
+
+static void
+pmap_tlb_invalidate_kernel(const struct pmap_tlb_context * const tlbctx)
+{
+	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
+
+	if (count == TLB_CTX_ALLVA) {
+		TLB_COUNT(invalidate_kern_tbia);
+		ALPHA_TBIA();
+	} else {
+		TLB_COUNT(invalidate_kern_tbis);
+		for (uintptr_t i = 0; i < count; i++) {
+			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
+		}
+	}
+	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_KIMB) {
+		TLB_COUNT(invalidate_kern_imb);
+		alpha_pal_imb();
+	}
+}
+
+static void
+pmap_tlb_invalidate(const struct pmap_tlb_context * const tlbctx,
+    const struct cpu_info * const ci)
+{
+	const uintptr_t count = TLB_CTX_COUNT(tlbctx);
 
-/* If we have more pending jobs than this, we just nail the whole TLB. */
-#define	PMAP_TLB_SHOOTDOWN_MAXJOBS	6
+	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_MULTI) {
+		pmap_tlb_invalidate_multi(tlbctx);
+		return;
+	}
 
-static struct pool_cache pmap_tlb_shootdown_job_cache;
+	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) {
+		pmap_tlb_invalidate_kernel(tlbctx);
+		return;
+	}
+
+	KASSERT(kpreempt_disabled());
+
+	pmap_t const pmap = tlbctx->t_pmap;
+	KASSERT(pmap != NULL);
+
+	const u_long cpu_mask = 1UL << ci->ci_cpuid;
+
+	if (__predict_false(pmap != ci->ci_pmap)) {
+		TLB_COUNT(invalidate_user_not_current);
+
+		/*
+		 * For CPUs that don't implement ASNs, the SWPCTX call
+		 * does all of the TLB invalidation work for us.
+		 */
+		if (__predict_false(pmap_max_asn == 0)) {
+			return;
+		}
+
+		/*
+		 * We cannot directly invalidate the TLB in this case,
+		 * so force allocation of a new ASN when the pmap becomes
+		 * active again.
+		 */
+		pmap->pm_asni[ci->ci_cpuid].pma_asngen = PMAP_ASNGEN_INVALID;
+		atomic_and_ulong(&pmap->pm_cpus, ~cpu_mask);
+
+		/*
+		 * This isn't strictly necessary; when we allocate a
+		 * new ASN, we're going to clear this bit and skip
+		 * syncing the I-stream.  But we will keep this bit
+		 * of accounting for internal consistency.
+		 */
+		if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
+			atomic_or_ulong(&pmap->pm_needisync, cpu_mask);
+		}
+		return;
+	}
+
+	if (TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_IMB) {
+		TLB_COUNT(invalidate_user_lazy_imb);
+		atomic_or_ulong(&pmap->pm_needisync, cpu_mask);
+	}
+
+	if (count == TLB_CTX_ALLVA) {
+		/*
+		 * Another option here for CPUs that implement ASNs is
+		 * to allocate a new ASN and do a SWPCTX.  That's almost
+		 * certainly faster than a TBIAP, but would require us
+		 * to synchronize against IPIs in pmap_activate().
+		 */
+		TLB_COUNT(invalidate_user_tbiap);
+		KASSERT((TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_ASM) == 0);
+		ALPHA_TBIAP();
+	} else {
+		TLB_COUNT(invalidate_user_tbis);
+		for (uintptr_t i = 0; i < count; i++) {
+			ALPHA_TBIS(TLB_CTX_VA(tlbctx, i));
+		}
+	}
+}
+
+static void
+pmap_tlb_shootnow(const struct pmap_tlb_context * const tlbctx)
+{
+
+	if (TLB_CTX_COUNT(tlbctx) == 0) {
+		/* No work to do. */
+		return;
+	}
+
+	/*
+	 * Acquire the shootdown mutex.  This will also block IPL_VM
+	 * interrupts and disable preemption.  It is critically important
+	 * that IPIs not be blocked in this routine.
+	 */
+	KASSERT((alpha_pal_rdps() & ALPHA_PSL_IPL_MASK) < ALPHA_PSL_IPL_CLOCK);
+	mutex_spin_enter(&tlb_lock);
+	tlb_evcnt.ev_count++;
+
+	const struct cpu_info *ci = curcpu();
+	const u_long this_cpu = 1UL << ci->ci_cpuid;
+	u_long active_cpus;
+	bool activation_locked;
+
+	/*
+	 * Figure out who to notify.  If it's for the kernel or
+	 * multiple address spaces, we notify everybody.  If
+	 * it's a single user pmap, then we try to acquire the
+	 * activation lock so we can get an accurate accounting
+	 * of who needs to be notified.  If we can't acquire
+	 * the activation lock, then just notify everyone and
+	 * let them sort it out when they process the IPI.
+	 */
+	if (TLB_CTX_FLAGS(tlbctx) & (TLB_CTX_F_ASM | TLB_CTX_F_MULTI)) {
+		active_cpus = pmap_all_cpus();
+		activation_locked = false;
+	} else {
+		KASSERT(tlbctx->t_pmap != NULL);
+		activation_locked = PMAP_ACT_TRYLOCK(tlbctx->t_pmap);
+		if (__predict_true(activation_locked)) {
+			active_cpus = tlbctx->t_pmap->pm_cpus;
+		} else {
+			TLB_COUNT(shootnow_over_notify);
+			active_cpus = pmap_all_cpus();
+		}
+	}
+
+#if defined(MULTIPROCESSOR)
+	/*
+	 * If there are remote CPUs that need to do work, get them
+	 * started now.
+	 */
+	const u_long remote_cpus = active_cpus & ~this_cpu;
+	KASSERT(tlb_context == NULL);
+	if (remote_cpus) {
+		TLB_COUNT(shootnow_remote);
+		tlb_context = tlbctx;
+		tlb_pending = remote_cpus;
+		alpha_wmb();
+		alpha_multicast_ipi(remote_cpus, ALPHA_IPI_SHOOTDOWN);
+	}
+#endif /* MULTIPROCESSOR */
+
+	/*
+	 * Now that the remotes have been notified, release the
+	 * activation lock.
+	 */
+	if (activation_locked) {
+		KASSERT(tlbctx->t_pmap != NULL);
+		PMAP_ACT_UNLOCK(tlbctx->t_pmap);
+	}
+
+	/*
+	 * Do any work that we might need to do.  We don't need to
+	 * synchronize with activation here because we know that
+	 * for the current CPU, activation status will not change.
+	 */
+	if (active_cpus & this_cpu) {
+		pmap_tlb_invalidate(tlbctx, ci);
+	}
+
+#if defined(MULTIPROCESSOR)
+	/* Wait for remote CPUs to finish. */
+	if (remote_cpus) {
+		int backoff = SPINLOCK_BACKOFF_MIN;
+		u_int spins = 0;
+
+		while (atomic_load_relaxed(&tlb_context) != NULL) {
+			SPINLOCK_BACKOFF(backoff);
+			if (spins++ > 0x0fffffff) {
+				alpha_mb();
+				printf("TLB LOCAL MASK  = 0x%016lx\n",
+				    this_cpu);
+				printf("TLB REMOTE MASK = 0x%016lx\n",
+				    remote_cpus);
+				printf("TLB REMOTE PENDING = 0x%016lx\n",
+				    tlb_pending);
+				printf("TLB CONTEXT = %p\n", tlb_context);
+				printf("TLB LOCAL IPL = %lu\n",
+				    alpha_pal_rdps() & ALPHA_PSL_IPL_MASK);
+				panic("pmap_tlb_shootnow");
+			}
+		}
+	}
+	KASSERT(tlb_context == NULL);
+#endif /* MULTIPROCESSOR */
+
+	mutex_spin_exit(&tlb_lock);
+
+	if (__predict_false(TLB_CTX_FLAGS(tlbctx) & TLB_CTX_F_PV)) {
+		/*
+		 * P->V TLB operations may operate on multiple pmaps.
+		 * The shootdown takes a reference on the first pmap it
+		 * encounters, in order to prevent it from disappearing,
+		 * in the hope that we end up with a single-pmap P->V
+		 * operation (instrumentation shows this is not rare).
+		 *
+		 * Once this shootdown is finished globally, we need to
+		 * release this extra reference.
+		 */
+		KASSERT(tlbctx->t_pmap != NULL);
+		pmap_destroy(tlbctx->t_pmap);
+	}
+}
+
+#if defined(MULTIPROCESSOR)
+void
+pmap_tlb_shootdown_ipi(struct cpu_info * const ci,
+    struct trapframe * const tf __unused)
+{
+	KASSERT(tlb_context != NULL);
+	pmap_tlb_invalidate(tlb_context, ci);
+	if (atomic_and_ulong_nv(&tlb_pending, ~(1UL << ci->ci_cpuid)) == 0) {
+		alpha_wmb();
+		atomic_store_relaxed(&tlb_context, NULL);
+	}
+}
 #endif /* MULTIPROCESSOR */
 
+static void
+pmap_tlb_physpage_free(paddr_t const ptpa,
+    struct pmap_tlb_context * const tlbctx)
+{
+	struct vm_page * const pg = PHYS_TO_VM_PAGE(ptpa);
+
+	KASSERT(pg != NULL);
+
+#ifdef DEBUG
+	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
+	KDASSERT(md->pvh_refcnt == 0);
+#endif
+
+	LIST_INSERT_HEAD(&tlbctx->t_freeptq, pg, pageq.list);
+}
+
+static void
+pmap_tlb_ptpage_drain(struct pmap_tlb_context * const tlbctx)
+{
+	struct vm_page *pg;
+
+	while ((pg = LIST_FIRST(&tlbctx->t_freeptq)) != NULL) {
+		LIST_REMOVE(pg, pageq.list);
+		uvm_pagefree(pg);
+	}
+}
+
 /*
  * Internal routines
  */
 static void	alpha_protection_init(void);
-static bool	pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool, long,
-				    pv_entry_t *);
-static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t, long);
+static pt_entry_t pmap_remove_mapping(pmap_t, vaddr_t, pt_entry_t *, bool,
+				      pv_entry_t *,
+				      struct pmap_tlb_context *);
+static void	pmap_changebit(struct vm_page *, pt_entry_t, pt_entry_t,
+			       struct pmap_tlb_context *);
 
 /*
  * PT page management functions.
  */
-static int	pmap_lev1map_create(pmap_t, long);
-static void	pmap_lev1map_destroy(pmap_t, long);
-static int	pmap_ptpage_alloc(pmap_t, pt_entry_t *, int);
-static void	pmap_ptpage_free(pmap_t, pt_entry_t *);
-static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *, long);
-static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *, long);
-static void	pmap_l1pt_delref(pmap_t, pt_entry_t *, long);
+static int	pmap_ptpage_alloc(pt_entry_t *, int);
+static void	pmap_ptpage_free(pt_entry_t *, struct pmap_tlb_context *);
+static void	pmap_l3pt_delref(pmap_t, vaddr_t, pt_entry_t *,
+		     struct pmap_tlb_context *);
+static void	pmap_l2pt_delref(pmap_t, pt_entry_t *, pt_entry_t *,
+		     struct pmap_tlb_context *);
+static void	pmap_l1pt_delref(pmap_t, pt_entry_t *);
 
 static void	*pmap_l1pt_alloc(struct pool *, int);
 static void	pmap_l1pt_free(struct pool *, void *);
@@ -488,7 +1113,7 @@ void	pmap_pv_dump(paddr_t);
 /*
  * ASN management functions.
  */
-static void	pmap_asn_alloc(pmap_t, long);
+static u_int	pmap_asn_alloc(pmap_t, struct cpu_info *);
 
 /*
  * Misc. functions.
@@ -498,129 +1123,7 @@ static void	pmap_physpage_free(paddr_t);
 static int	pmap_physpage_addref(void *);
 static int	pmap_physpage_delref(void *);
 
-/*
- * PMAP_ISACTIVE:
- *
- *	Check to see if a pmap is active on the current processor.
- */
-#define	PMAP_ISACTIVE(pm, cpu_id)					\
-	(((pm)->pm_cpus & (1UL << (cpu_id))) != 0)
-
-/*
- * PMAP_ACTIVATE_ASN_SANITY:
- *
- *	DEBUG sanity checks for ASNs within PMAP_ACTIVATE.
- */
-#ifdef DEBUG
-#define	PMAP_ACTIVATE_ASN_SANITY(pmap, cpu_id)				\
-do {									\
-	struct pmap_asn_info *__pma = &(pmap)->pm_asni[(cpu_id)];	\
-	struct pmap_asn_info *__cpma = &pmap_asn_info[(cpu_id)];	\
-									\
-	if ((pmap)->pm_lev1map == kernel_lev1map) {			\
-		/*							\
-		 * This pmap implementation also ensures that pmaps	\
-		 * referencing kernel_lev1map use a reserved ASN	\
-		 * ASN to prevent the PALcode from servicing a TLB	\
-		 * miss	with the wrong PTE.				\
-		 */							\
-		KASSERT(__pma->pma_asn == PMAP_ASN_RESERVED);		\
-	} else {							\
-		KASSERT(__pma->pma_asngen == __cpma->pma_asngen);	\
-		KASSERT(__pma->pma_asn != PMAP_ASN_RESERVED);		\
-	}								\
-} while (/*CONSTCOND*/0)
-#else
-#define	PMAP_ACTIVATE_ASN_SANITY(pmap, cpu_id)	/* nothing */
-#endif
-
-/*
- * PMAP_SET_NEEDISYNC:
- *
- *	Mark that a user pmap needs an I-stream synch on its
- *	way back out to userspace.
- */
-#define	PMAP_SET_NEEDISYNC(pmap)	(pmap)->pm_needisync = ~0UL
-
-/*
- * PMAP_SYNC_ISTREAM:
- *
- *	Synchronize the I-stream for the specified pmap.  For user
- *	pmaps, this is deferred until a process using the pmap returns
- *	to userspace.
- */
-#if defined(MULTIPROCESSOR)
-#define	PMAP_SYNC_ISTREAM_KERNEL()					\
-do {									\
-	alpha_pal_imb();						\
-	alpha_broadcast_ipi(ALPHA_IPI_IMB);				\
-} while (/*CONSTCOND*/0)
-
-#define	PMAP_SYNC_ISTREAM_USER(pmap)					\
-do {									\
-	alpha_multicast_ipi((pmap)->pm_cpus, ALPHA_IPI_AST);		\
-	/* for curcpu, will happen in userret() */			\
-} while (/*CONSTCOND*/0)
-#else
-#define	PMAP_SYNC_ISTREAM_KERNEL()	alpha_pal_imb()
-#define	PMAP_SYNC_ISTREAM_USER(pmap)	/* will happen in userret() */
-#endif /* MULTIPROCESSOR */
-
-#define	PMAP_SYNC_ISTREAM(pmap)						\
-do {									\
-	if ((pmap) == pmap_kernel())					\
-		PMAP_SYNC_ISTREAM_KERNEL();				\
-	else								\
-		PMAP_SYNC_ISTREAM_USER(pmap);				\
-} while (/*CONSTCOND*/0)
-
-/*
- * PMAP_INVALIDATE_ASN:
- *
- *	Invalidate the specified pmap's ASN, so as to force allocation
- *	of a new one the next time pmap_asn_alloc() is called.
- *
- *	NOTE: THIS MUST ONLY BE CALLED IF AT LEAST ONE OF THE FOLLOWING
- *	CONDITIONS ARE true:
- *
- *		(1) The pmap references the global kernel_lev1map.
- *
- *		(2) The pmap is not active on the current processor.
- */
-#define	PMAP_INVALIDATE_ASN(pmap, cpu_id)				\
-do {									\
-	(pmap)->pm_asni[(cpu_id)].pma_asn = PMAP_ASN_RESERVED;		\
-} while (/*CONSTCOND*/0)
-
-/*
- * PMAP_INVALIDATE_TLB:
- *
- *	Invalidate the TLB entry for the pmap/va pair.
- */
-#define	PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id)		\
-do {									\
-	if ((hadasm) || (isactive)) {					\
-		/*							\
-		 * Simply invalidating the TLB entry and I-cache	\
-		 * works in this case.					\
-		 */							\
-		ALPHA_TBIS((va));					\
-	} else if ((pmap)->pm_asni[(cpu_id)].pma_asngen ==		\
-		   pmap_asn_info[(cpu_id)].pma_asngen) {		\
-		/*							\
-		 * We can't directly invalidate the TLB entry		\
-		 * in this case, so we have to force allocation		\
-		 * of a new ASN the next time this pmap becomes		\
-		 * active.						\
-		 */							\
-		PMAP_INVALIDATE_ASN((pmap), (cpu_id));			\
-	}								\
-		/*							\
-		 * Nothing to do in this case; the next time the	\
-		 * pmap becomes active on this processor, a new		\
-		 * ASN will be allocated anyway.			\
-		 */							\
-} while (/*CONSTCOND*/0)
+static bool	vtophys_internal(vaddr_t, paddr_t *p);
 
 /*
  * PMAP_KERNEL_PTE:
@@ -655,13 +1158,6 @@ do {									\
 #endif
 
 /*
- * PMAP_SET_PTE:
- *
- *	Set a PTE to a specified value.
- */
-#define	PMAP_SET_PTE(ptep, val)	*(ptep) = (val)
-
-/*
  * PMAP_STAT_{INCR,DECR}:
  *
  *	Increment or decrement a pmap statistic.
@@ -670,6 +1166,26 @@ do {									\
 #define	PMAP_STAT_DECR(s, v)	atomic_add_long((unsigned long *)(&(s)), -(v))
 
 /*
+ * pmap_init_cpu:
+ *
+ *	Initialize pmap data in the cpu_info.
+ */
+void
+pmap_init_cpu(struct cpu_info * const ci)
+{
+	pmap_t const pmap = pmap_kernel();
+
+	/* All CPUs start out using the kernel pmap. */
+	atomic_or_ulong(&pmap->pm_cpus, 1UL << ci->ci_cpuid);
+	pmap_reference(pmap);
+	ci->ci_pmap = pmap;
+
+	/* Initialize ASN allocation logic. */
+	ci->ci_next_asn = PMAP_ASN_FIRST_USER;
+	ci->ci_asn_gen = PMAP_ASNGEN_INITIAL;
+}
+
+/*
  * pmap_bootstrap:
  *
  *	Bootstrap the system to run with virtual memory.
@@ -817,8 +1333,8 @@ pmap_bootstrap(paddr_t ptaddr, u_int max
 	 * Initialize the pmap pools and list.
 	 */
 	pmap_ncpuids = ncpuids;
-	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids), 0,
-	    0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
+	pool_cache_bootstrap(&pmap_pmap_cache, PMAP_SIZEOF(pmap_ncpuids),
+	    COHERENCY_UNIT, 0, 0, "pmap", NULL, IPL_NONE, NULL, NULL, NULL);
 	pool_cache_bootstrap(&pmap_l1pt_cache, PAGE_SIZE, 0, 0, 0, "pmapl1pt",
 	    &pmap_l1pt_allocator, IPL_NONE, pmap_l1pt_ctor, NULL, NULL);
 	pool_cache_bootstrap(&pmap_pv_cache, sizeof(struct pv_entry), 0, 0,
@@ -827,14 +1343,8 @@ pmap_bootstrap(paddr_t ptaddr, u_int max
 
 	TAILQ_INIT(&pmap_all_pmaps);
 
-	/*
-	 * Initialize the ASN logic.
-	 */
+	/* Initialize the ASN logic.  See also pmap_init_cpu(). */
 	pmap_max_asn = maxasn;
-	for (i = 0; i < ALPHA_MAXPROCS; i++) {
-		pmap_asn_info[i].pma_asn = 1;
-		pmap_asn_info[i].pma_asngen = 0;
-	}
 
 	/*
 	 * Initialize the locks.
@@ -844,6 +1354,18 @@ pmap_bootstrap(paddr_t ptaddr, u_int max
 	for (i = 0; i < __arraycount(pmap_pvh_locks); i++) {
 		mutex_init(&pmap_pvh_locks[i].lock, MUTEX_DEFAULT, IPL_NONE);
 	}
+	for (i = 0; i < __arraycount(pmap_pmap_locks); i++) {
+		mutex_init(&pmap_pmap_locks[i].locks.lock,
+		    MUTEX_DEFAULT, IPL_NONE);
+		mutex_init(&pmap_pmap_locks[i].locks.activation_lock,
+		    MUTEX_SPIN, IPL_SCHED);
+	}
+	
+	/*
+	 * This must block any interrupt from which a TLB shootdown
+	 * could be issued, but must NOT block IPIs.
+	 */
+	mutex_init(&tlb_lock, MUTEX_SPIN, IPL_VM);
 
 	/*
 	 * Initialize kernel pmap.  Note that all kernel mappings
@@ -855,28 +1377,9 @@ pmap_bootstrap(paddr_t ptaddr, u_int max
 	memset(pmap_kernel(), 0, sizeof(struct pmap));
 	pmap_kernel()->pm_lev1map = kernel_lev1map;
 	pmap_kernel()->pm_count = 1;
-	for (i = 0; i < ALPHA_MAXPROCS; i++) {
-		pmap_kernel()->pm_asni[i].pma_asn = PMAP_ASN_RESERVED;
-		pmap_kernel()->pm_asni[i].pma_asngen =
-		    pmap_asn_info[i].pma_asngen;
-	}
-	mutex_init(&pmap_kernel()->pm_lock, MUTEX_DEFAULT, IPL_NONE);
+	/* Kernel pmap does not have ASN info. */
 	TAILQ_INSERT_TAIL(&pmap_all_pmaps, pmap_kernel(), pm_list);
 
-#if defined(MULTIPROCESSOR)
-	/*
-	 * Initialize the TLB shootdown queues.
-	 */
-	pool_cache_bootstrap(&pmap_tlb_shootdown_job_cache,
-	    sizeof(struct pmap_tlb_shootdown_job), CACHE_LINE_SIZE,
-	     0, PR_LARGECACHE, "pmaptlb", NULL, IPL_VM, NULL, NULL, NULL);
-	for (i = 0; i < ALPHA_MAXPROCS; i++) {
-		TAILQ_INIT(&pmap_tlb_shootdown_q[i].pq_head);
-		mutex_init(&pmap_tlb_shootdown_q[i].pq_lock, MUTEX_DEFAULT,
-		    IPL_SCHED);
-	}
-#endif
-
 	/*
 	 * Set up lwp0's PCB such that the ptbr points to the right place
 	 * and has the kernel pmap's (really unused) ASN.
@@ -884,13 +1387,10 @@ pmap_bootstrap(paddr_t ptaddr, u_int max
 	pcb = lwp_getpcb(&lwp0);
 	pcb->pcb_hw.apcb_ptbr =
 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)kernel_lev1map) >> PGSHIFT;
-	pcb->pcb_hw.apcb_asn = pmap_kernel()->pm_asni[cpu_number()].pma_asn;
+	pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
 
-	/*
-	 * Mark the kernel pmap `active' on this processor.
-	 */
-	atomic_or_ulong(&pmap_kernel()->pm_cpus,
-	    (1UL << cpu_number()));
+	struct cpu_info * const ci = curcpu();
+	pmap_init_cpu(ci);
 }
 
 #ifdef _PMAP_MAY_USE_PROM_CONSOLE
@@ -1017,6 +1517,9 @@ pmap_init(void)
 	/* initialize protection array */
 	alpha_protection_init();
 
+	/* Initialize TLB handling. */
+	pmap_tlb_init();
+
 	/*
 	 * Set a low water mark on the pv_entry pool, so that we are
 	 * more likely to have these around even in extreme memory
@@ -1065,21 +1568,23 @@ pmap_create(void)
 	pmap = pool_cache_get(&pmap_pmap_cache, PR_WAITOK);
 	memset(pmap, 0, sizeof(*pmap));
 
-	/* Reference the kernel_lev1map until we allocate our own below. */
-	pmap->pm_lev1map = kernel_lev1map;
-
 	pmap->pm_count = 1;
+
+	/*
+	 * There are only kernel mappings at this point; give the pmap
+	 * the kernel ASN.  This will be initialized to correct values
+	 * when the pmap is activated.
+	 */
 	for (i = 0; i < pmap_ncpuids; i++) {
-		pmap->pm_asni[i].pma_asn = PMAP_ASN_RESERVED;
-		/* XXX Locking? */
-		pmap->pm_asni[i].pma_asngen = pmap_asn_info[i].pma_asngen;
+		pmap->pm_asni[i].pma_asn = PMAP_ASN_KERNEL;
+		pmap->pm_asni[i].pma_asngen = PMAP_ASNGEN_INVALID;
 	}
-	mutex_init(&pmap->pm_lock, MUTEX_DEFAULT, IPL_NONE);
 
  try_again:
 	rw_enter(&pmap_growkernel_lock, RW_READER);
 
-	if (pmap_lev1map_create(pmap, cpu_number()) != 0) {
+	pmap->pm_lev1map = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
+	if (__predict_false(pmap->pm_lev1map == NULL)) {
 		rw_exit(&pmap_growkernel_lock);
 		(void) kpause("pmap_create", false, hz >> 2, NULL);
 		goto try_again;
@@ -1109,7 +1614,8 @@ pmap_destroy(pmap_t pmap)
 		printf("pmap_destroy(%p)\n", pmap);
 #endif
 
-	if (atomic_dec_uint_nv(&pmap->pm_count) > 0)
+	PMAP_MP(alpha_mb());
+	if (atomic_dec_ulong_nv(&pmap->pm_count) > 0)
 		return;
 
 	rw_enter(&pmap_growkernel_lock, RW_READER);
@@ -1121,11 +1627,11 @@ pmap_destroy(pmap_t pmap)
 	TAILQ_REMOVE(&pmap_all_pmaps, pmap, pm_list);
 	mutex_exit(&pmap_all_pmaps_lock);
 
-	pmap_lev1map_destroy(pmap, cpu_number());
+	pool_cache_put(&pmap_l1pt_cache, pmap->pm_lev1map);
+	pmap->pm_lev1map = NULL;
 
 	rw_exit(&pmap_growkernel_lock);
 
-	mutex_destroy(&pmap->pm_lock);
 	pool_cache_put(&pmap_pmap_cache, pmap);
 }
 
@@ -1143,7 +1649,8 @@ pmap_reference(pmap_t pmap)
 		printf("pmap_reference(%p)\n", pmap);
 #endif
 
-	atomic_inc_uint(&pmap->pm_count);
+	atomic_inc_ulong(&pmap->pm_count);
+	PMAP_MP(alpha_mb());
 }
 
 /*
@@ -1154,14 +1661,14 @@ pmap_reference(pmap_t pmap)
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  */
-void
-pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
+static void
+pmap_remove_internal(pmap_t pmap, vaddr_t sva, vaddr_t eva,
+    struct pmap_tlb_context * const tlbctx)
 {
 	pt_entry_t *l1pte, *l2pte, *l3pte;
-	pt_entry_t *saved_l1pte, *saved_l2pte, *saved_l3pte;
-	vaddr_t l1eva, l2eva, vptva;
-	bool needisync = false;
-	long cpu_id = cpu_number();
+	pt_entry_t *saved_l2pte, *saved_l3pte;
+	vaddr_t l1eva, l2eva, l3vptva;
+	pt_entry_t pte_bits;
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
@@ -1183,45 +1690,31 @@ pmap_remove(pmap_t pmap, vaddr_t sva, va
 		while (sva < eva) {
 			l3pte = PMAP_KERNEL_PTE(sva);
 			if (pmap_pte_v(l3pte)) {
-#ifdef DIAGNOSTIC
-				if (uvm_pageismanaged(pmap_pte_pa(l3pte)) &&
-				    pmap_pte_pv(l3pte) == 0)
-					panic("pmap_remove: managed page "
-					    "without PG_PVLIST for 0x%lx",
-					    sva);
-#endif
-				needisync |= pmap_remove_mapping(pmap, sva,
-				    l3pte, true, cpu_id, NULL);
+				pte_bits = pmap_remove_mapping(pmap, sva,
+				    l3pte, true, NULL, tlbctx);
+				pmap_tlb_shootdown(pmap, sva, pte_bits,
+				    tlbctx);
 			}
 			sva += PAGE_SIZE;
 		}
 
-		PMAP_UNLOCK(pmap);
 		PMAP_MAP_TO_HEAD_UNLOCK();
+		PMAP_UNLOCK(pmap);
+		pmap_tlb_shootnow(tlbctx);
+		pmap_tlb_ptpage_drain(tlbctx);
+		TLB_COUNT(reason_remove_kernel);
 
-		if (needisync)
-			PMAP_SYNC_ISTREAM_KERNEL();
 		return;
 	}
 
-#ifdef DIAGNOSTIC
-	if (sva > VM_MAXUSER_ADDRESS || eva > VM_MAXUSER_ADDRESS)
-		panic("pmap_remove: (0x%lx - 0x%lx) user pmap, kernel "
-		    "address range", sva, eva);
-#endif
+	KASSERT(sva < VM_MAXUSER_ADDRESS);
+	KASSERT(eva <= VM_MAXUSER_ADDRESS);
+	KASSERT(pmap->pm_lev1map != kernel_lev1map);
 
 	PMAP_MAP_TO_HEAD_LOCK();
 	PMAP_LOCK(pmap);
 
-	KASSERT(pmap->pm_lev1map != kernel_lev1map);
-
-	saved_l1pte = l1pte = pmap_l1pte(pmap, sva);
-
-	/*
-	 * Add a reference to the L1 table to it won't get
-	 * removed from under us.
-	 */
-	pmap_physpage_addref(saved_l1pte);
+	l1pte = pmap_l1pte(pmap, sva);
 
 	for (; sva < eva; sva = l1eva, l1pte++) {
 		l1eva = alpha_trunc_l1seg(sva) + ALPHA_L1SEG_SIZE;
@@ -1252,18 +1745,20 @@ pmap_remove(pmap_t pmap, vaddr_t sva, va
 					 * gets removed, we need to invalidate
 					 * the VPT TLB entry for it.
 					 */
-					vptva = sva;
+					l3vptva = sva;
 
 					for (; sva < l2eva && sva < eva;
 					     sva += PAGE_SIZE, l3pte++) {
 						if (!pmap_pte_v(l3pte)) {
 							continue;
 						}
-						needisync |=
+						pte_bits =
 						    pmap_remove_mapping(
 							pmap, sva,
 							l3pte, true,
-							cpu_id, NULL);
+							NULL, tlbctx);
+						pmap_tlb_shootdown(pmap,
+						    sva, pte_bits, tlbctx);
 					}
 
 					/*
@@ -1271,8 +1766,8 @@ pmap_remove(pmap_t pmap, vaddr_t sva, va
 					 * table that we added above.  This
 					 * may free the L3 table.
 					 */
-					pmap_l3pt_delref(pmap, vptva,
-					    saved_l3pte, cpu_id);
+					pmap_l3pt_delref(pmap, l3vptva,
+					    saved_l3pte, tlbctx);
 				}
 			}
 
@@ -1280,21 +1775,24 @@ pmap_remove(pmap_t pmap, vaddr_t sva, va
 			 * Remove the reference to the L2 table that we
 			 * added above.  This may free the L2 table.
 			 */
-			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, cpu_id);
+			pmap_l2pt_delref(pmap, l1pte, saved_l2pte, tlbctx);
 		}
 	}
 
-	/*
-	 * Remove the reference to the L1 table that we added above.
-	 * This may free the L1 table.
-	 */
-	pmap_l1pt_delref(pmap, saved_l1pte, cpu_id);
+	PMAP_MAP_TO_HEAD_UNLOCK();
+	PMAP_UNLOCK(pmap);
+	pmap_tlb_shootnow(tlbctx);
+	pmap_tlb_ptpage_drain(tlbctx);
+	TLB_COUNT(reason_remove_user);
+}
 
-	if (needisync)
-		PMAP_SYNC_ISTREAM_USER(pmap);
+void
+pmap_remove(pmap_t pmap, vaddr_t sva, vaddr_t eva)
+{
+	struct pmap_tlb_context tlbctx;
 
-	PMAP_UNLOCK(pmap);
-	PMAP_MAP_TO_HEAD_UNLOCK();
+	pmap_tlb_context_init(&tlbctx);
+	pmap_remove_internal(pmap, sva, eva, &tlbctx);
 }
 
 /*
@@ -1307,12 +1805,11 @@ void
 pmap_page_protect(struct vm_page *pg, vm_prot_t prot)
 {
 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
-	pmap_t pmap;
 	pv_entry_t pv, nextpv;
-	bool needkisync = false;
-	long cpu_id = cpu_number();
+	pt_entry_t opte;
 	kmutex_t *lock;
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
+	struct pmap_tlb_context tlbctx;
+
 #ifdef DEBUG
 	paddr_t pa = VM_PAGE_TO_PHYS(pg);
 
@@ -1322,6 +1819,8 @@ pmap_page_protect(struct vm_page *pg, vm
 		printf("pmap_page_protect(%p, %x)\n", pg, prot);
 #endif
 
+	pmap_tlb_context_init(&tlbctx);
+
 	switch (prot) {
 	case VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE:
 	case VM_PROT_READ|VM_PROT_WRITE:
@@ -1335,19 +1834,18 @@ pmap_page_protect(struct vm_page *pg, vm
 		mutex_enter(lock);
 		for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
 			PMAP_LOCK(pv->pv_pmap);
-			if (*pv->pv_pte & (PG_KWE | PG_UWE)) {
-				*pv->pv_pte &= ~(PG_KWE | PG_UWE);
-				PMAP_INVALIDATE_TLB(pv->pv_pmap, pv->pv_va,
-				    pmap_pte_asm(pv->pv_pte),
-				    PMAP_ISACTIVE(pv->pv_pmap, cpu_id), cpu_id);
-				PMAP_TLB_SHOOTDOWN(pv->pv_pmap, pv->pv_va,
-				    pmap_pte_asm(pv->pv_pte));
+			opte = atomic_load_relaxed(pv->pv_pte);
+			if (opte & (PG_KWE | PG_UWE)) {
+				atomic_store_relaxed(pv->pv_pte,
+				    opte & ~(PG_KWE | PG_UWE));
+				pmap_tlb_shootdown_pv(pv, opte, &tlbctx);
 			}
 			PMAP_UNLOCK(pv->pv_pmap);
 		}
 		mutex_exit(lock);
 		PMAP_HEAD_TO_MAP_UNLOCK();
-		PMAP_TLB_SHOOTNOW();
+		pmap_tlb_shootnow(&tlbctx);
+		TLB_COUNT(reason_page_protect_read);
 		return;
 
 	/* remove_all */
@@ -1359,30 +1857,21 @@ pmap_page_protect(struct vm_page *pg, vm
 	lock = pmap_pvh_lock(pg);
 	mutex_enter(lock);
 	for (pv = md->pvh_list; pv != NULL; pv = nextpv) {
+		pt_entry_t pte_bits;
+
 		nextpv = pv->pv_next;
-		pmap = pv->pv_pmap;
 
-		PMAP_LOCK(pmap);
-#ifdef DEBUG
-		if (pmap_pte_v(pmap_l2pte(pv->pv_pmap, pv->pv_va, NULL)) == 0 ||
-		    pmap_pte_pa(pv->pv_pte) != pa)
-			panic("pmap_page_protect: bad mapping");
-#endif
-		if (pmap_remove_mapping(pmap, pv->pv_va, pv->pv_pte,
-		    false, cpu_id, NULL)) {
-			if (pmap == pmap_kernel())
-				needkisync |= true;
-			else
-				PMAP_SYNC_ISTREAM_USER(pmap);
-		}
-		PMAP_UNLOCK(pmap);
+		PMAP_LOCK(pv->pv_pmap);
+		pte_bits = pmap_remove_mapping(pv->pv_pmap, pv->pv_va,
+		    pv->pv_pte, false, NULL, &tlbctx);
+		pmap_tlb_shootdown_pv(pv, pte_bits, &tlbctx);
+		PMAP_UNLOCK(pv->pv_pmap);
 	}
-
-	if (needkisync)
-		PMAP_SYNC_ISTREAM_KERNEL();
-
 	mutex_exit(lock);
 	PMAP_HEAD_TO_MAP_UNLOCK();
+	pmap_tlb_shootnow(&tlbctx);
+	pmap_tlb_ptpage_drain(&tlbctx);
+	TLB_COUNT(reason_page_protect_none);
 }
 
 /*
@@ -1394,12 +1883,9 @@ pmap_page_protect(struct vm_page *pg, vm
 void
 pmap_protect(pmap_t pmap, vaddr_t sva, vaddr_t eva, vm_prot_t prot)
 {
-	pt_entry_t *l1pte, *l2pte, *l3pte, bits;
-	bool isactive;
-	bool hadasm;
+	pt_entry_t *l1pte, *l2pte, *l3pte, opte;
 	vaddr_t l1eva, l2eva;
-	long cpu_id = cpu_number();
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
+	struct pmap_tlb_context tlbctx;
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_PROTECT))
@@ -1407,15 +1893,16 @@ pmap_protect(pmap_t pmap, vaddr_t sva, v
 		    pmap, sva, eva, prot);
 #endif
 
+	pmap_tlb_context_init(&tlbctx);
+
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
-		pmap_remove(pmap, sva, eva);
+		pmap_remove_internal(pmap, sva, eva, &tlbctx);
 		return;
 	}
 
-	PMAP_LOCK(pmap);
+	const pt_entry_t bits = pte_prot(pmap, prot);
 
-	bits = pte_prot(pmap, prot);
-	isactive = PMAP_ISACTIVE(pmap, cpu_id);
+	PMAP_LOCK(pmap);
 
 	l1pte = pmap_l1pte(pmap, sva);
 	for (; sva < eva; sva = l1eva, l1pte++) {
@@ -1431,18 +1918,12 @@ pmap_protect(pmap_t pmap, vaddr_t sva, v
 					     sva += PAGE_SIZE, l3pte++) {
 						if (pmap_pte_v(l3pte) &&
 						    pmap_pte_prot_chg(l3pte,
-						    bits)) {
-							hadasm =
-							   (pmap_pte_asm(l3pte)
-							    != 0);
+								      bits)) {
+							opte = atomic_load_relaxed(l3pte);
 							pmap_pte_set_prot(l3pte,
 							   bits);
-							PMAP_INVALIDATE_TLB(
-							   pmap, sva, hadasm,
-							   isactive, cpu_id);
-							PMAP_TLB_SHOOTDOWN(
-							   pmap, sva,
-							   hadasm ? PG_ASM : 0);
+							pmap_tlb_shootdown(pmap,
+							    sva, opte, &tlbctx);
 						}
 					}
 				}
@@ -1450,12 +1931,86 @@ pmap_protect(pmap_t pmap, vaddr_t sva, v
 		}
 	}
 
-	PMAP_TLB_SHOOTNOW();
+	PMAP_UNLOCK(pmap);
+	pmap_tlb_shootnow(&tlbctx);
+	TLB_COUNT(reason_protect);
+}
 
-	if (prot & VM_PROT_EXECUTE)
-		PMAP_SYNC_ISTREAM(pmap);
+/*
+ * pmap_enter_tlb_shootdown:
+ *
+ *	Carry out a TLB shootdown on behalf of a pmap_enter()
+ *	or a pmap_kenter_pa().  This is factored out separately
+ *	because we expect it to be not a common case.
+ */
+static void __noinline
+pmap_enter_tlb_shootdown(pmap_t const pmap, vaddr_t const va,
+    pt_entry_t const pte_bits, bool locked)
+{
+	struct pmap_tlb_context tlbctx;
+
+	pmap_tlb_context_init(&tlbctx);
+	pmap_tlb_shootdown(pmap, va, pte_bits, &tlbctx);
+	if (locked) {
+		PMAP_UNLOCK(pmap);
+	}
+	pmap_tlb_shootnow(&tlbctx);
+}
+
+/*
+ * pmap_enter_l2pt_delref:
+ *
+ *	Release a reference on an L2 PT page for pmap_enter().
+ *	This is factored out separately because we expect it
+ *	to be a rare case.
+ */
+static void __noinline
+pmap_enter_l2pt_delref(pmap_t const pmap, pt_entry_t * const l1pte,
+    pt_entry_t * const l2pte)
+{
+	struct pmap_tlb_context tlbctx;
 
+	/*
+	 * PALcode may have tried to service a TLB miss with
+	 * this L2 PTE, so we need to make sure we don't actually
+	 * free the PT page until we've shot down any TLB entries
+	 * for this VPT index.
+	 */
+
+	pmap_tlb_context_init(&tlbctx);
+	pmap_l2pt_delref(pmap, l1pte, l2pte, &tlbctx);
 	PMAP_UNLOCK(pmap);
+	pmap_tlb_shootnow(&tlbctx);
+	pmap_tlb_ptpage_drain(&tlbctx);
+	TLB_COUNT(reason_enter_l2pt_delref);
+}
+
+/*
+ * pmap_enter_l3pt_delref:
+ *
+ *	Release a reference on an L3 PT page for pmap_enter().
+ *	This is factored out separately because we expect it
+ *	to be a rare case.
+ */
+static void __noinline
+pmap_enter_l3pt_delref(pmap_t const pmap, vaddr_t const va,
+    pt_entry_t * const pte)
+{
+	struct pmap_tlb_context tlbctx;
+
+	/*
+	 * PALcode may have tried to service a TLB miss with
+	 * this PTE, so we need to make sure we don't actually
+	 * free the PT page until we've shot down any TLB entries
+	 * for this VPT index.
+	 */
+
+	pmap_tlb_context_init(&tlbctx);
+	pmap_l3pt_delref(pmap, va, pte, &tlbctx);
+	PMAP_UNLOCK(pmap);
+	pmap_tlb_shootnow(&tlbctx);
+	pmap_tlb_ptpage_drain(&tlbctx);
+	TLB_COUNT(reason_enter_l3pt_delref);
 }
 
 /*
@@ -1475,68 +2030,31 @@ pmap_protect(pmap_t pmap, vaddr_t sva, v
 int
 pmap_enter(pmap_t pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
 {
-	struct vm_page *pg;			/* if != NULL, managed page */
 	pt_entry_t *pte, npte, opte;
 	pv_entry_t opv = NULL;
 	paddr_t opa;
-	bool tflush = true;
-	bool hadasm = false;	/* XXX gcc -Wuninitialized */
-	bool needisync = false;
-	bool setisync = false;
-	bool isactive;
-	bool wired;
-	long cpu_id = cpu_number();
+	bool tflush = false;
 	int error = 0;
 	kmutex_t *lock;
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
 		printf("pmap_enter(%p, %lx, %lx, %x, %x)\n",
 		       pmap, va, pa, prot, flags);
 #endif
-	pg = PHYS_TO_VM_PAGE(pa);
-	isactive = PMAP_ISACTIVE(pmap, cpu_id);
-	wired = (flags & PMAP_WIRED) != 0;
-
-	/*
-	 * Determine what we need to do about the I-stream.  If
-	 * VM_PROT_EXECUTE is set, we mark a user pmap as needing
-	 * an I-sync on the way back out to userspace.  We always
-	 * need an immediate I-sync for the kernel pmap.
-	 */
-	if (prot & VM_PROT_EXECUTE) {
-		if (pmap == pmap_kernel())
-			needisync = true;
-		else {
-			setisync = true;
-			needisync = (pmap->pm_cpus != 0);
-		}
-	}
+	struct vm_page * const pg = PHYS_TO_VM_PAGE(pa);
+	const bool wired = (flags & PMAP_WIRED) != 0;
 
 	PMAP_MAP_TO_HEAD_LOCK();
 	PMAP_LOCK(pmap);
 
 	if (pmap == pmap_kernel()) {
-#ifdef DIAGNOSTIC
-		/*
-		 * Sanity check the virtual address.
-		 */
-		if (va < VM_MIN_KERNEL_ADDRESS)
-			panic("pmap_enter: kernel pmap, invalid va 0x%lx", va);
-#endif
+		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
 		pte = PMAP_KERNEL_PTE(va);
 	} else {
 		pt_entry_t *l1pte, *l2pte;
 
-#ifdef DIAGNOSTIC
-		/*
-		 * Sanity check the virtual address.
-		 */
-		if (va >= VM_MAXUSER_ADDRESS)
-			panic("pmap_enter: user pmap, invalid va 0x%lx", va);
-#endif
-
+		KASSERT(va < VM_MAXUSER_ADDRESS);
 		KASSERT(pmap->pm_lev1map != kernel_lev1map);
 
 		/*
@@ -1548,9 +2066,9 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 		l1pte = pmap_l1pte(pmap, va);
 		if (pmap_pte_v(l1pte) == 0) {
 			pmap_physpage_addref(l1pte);
-			error = pmap_ptpage_alloc(pmap, l1pte, PGU_L2PT);
+			error = pmap_ptpage_alloc(l1pte, PGU_L2PT);
 			if (error) {
-				pmap_l1pt_delref(pmap, l1pte, cpu_id);
+				pmap_l1pt_delref(pmap, l1pte);
 				if (flags & PMAP_CANFAIL)
 					goto out;
 				panic("pmap_enter: unable to create L2 PT "
@@ -1572,11 +2090,14 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 		l2pte = pmap_l2pte(pmap, va, l1pte);
 		if (pmap_pte_v(l2pte) == 0) {
 			pmap_physpage_addref(l2pte);
-			error = pmap_ptpage_alloc(pmap, l2pte, PGU_L3PT);
+			error = pmap_ptpage_alloc(l2pte, PGU_L3PT);
 			if (error) {
-				pmap_l2pt_delref(pmap, l1pte, l2pte, cpu_id);
-				if (flags & PMAP_CANFAIL)
+				/* unlocks pmap */
+				pmap_enter_l2pt_delref(pmap, l1pte, l2pte);
+				if (flags & PMAP_CANFAIL) {
+					PMAP_LOCK(pmap);
 					goto out;
+				}
 				panic("pmap_enter: unable to create L3 PT "
 				    "page");
 			}
@@ -1594,25 +2115,14 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 	}
 
 	/* Remember all of the old PTE; used for TBI check later. */
-	opte = *pte;
+	opte = atomic_load_relaxed(pte);
 
 	/*
 	 * Check to see if the old mapping is valid.  If not, validate the
 	 * new one immediately.
 	 */
-	if (pmap_pte_v(pte) == 0) {
-		/*
-		 * No need to invalidate the TLB in this case; an invalid
-		 * mapping won't be in the TLB, and a previously valid
-		 * mapping would have been flushed when it was invalidated.
-		 */
-		tflush = false;
-
-		/*
-		 * No need to synchronize the I-stream, either, for basically
-		 * the same reason.
-		 */
-		setisync = needisync = false;
+	if ((opte & PG_V) == 0) {
+		/* No TLB invalidatons needed for new mappings. */
 
 		if (pmap != pmap_kernel()) {
 			/*
@@ -1625,7 +2135,6 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 	}
 
 	opa = pmap_pte_pa(pte);
-	hadasm = (pmap_pte_asm(pte) != 0);
 
 	if (opa == pa) {
 		/*
@@ -1638,18 +2147,14 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 				printf("pmap_enter: wiring change -> %d\n",
 				    wired);
 #endif
-			/*
-			 * Adjust the wiring count.
-			 */
+			/* Adjust the wiring count. */
 			if (wired)
 				PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
 			else
 				PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
 		}
 
-		/*
-		 * Set the PTE.
-		 */
+		/* Set the PTE. */
 		goto validate;
 	}
 
@@ -1670,44 +2175,43 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 		 */
 		pmap_physpage_addref(pte);
 	}
-	needisync |= pmap_remove_mapping(pmap, va, pte, true, cpu_id, &opv);
+	/* Already have the bits from opte above. */
+	(void) pmap_remove_mapping(pmap, va, pte, true, &opv, NULL);
 
  validate_enterpv:
-	/*
-	 * Enter the mapping into the pv_table if appropriate.
-	 */
+	/* Enter the mapping into the pv_table if appropriate. */
 	if (pg != NULL) {
 		error = pmap_pv_enter(pmap, pg, va, pte, true, opv);
-		opv = NULL;
 		if (error) {
-			pmap_l3pt_delref(pmap, va, pte, cpu_id);
-			if (flags & PMAP_CANFAIL)
+			/* This can only fail if opv == NULL */
+			KASSERT(opv == NULL);
+
+			/* unlocks pmap */
+			pmap_enter_l3pt_delref(pmap, va, pte);
+			if (flags & PMAP_CANFAIL) {
+				PMAP_LOCK(pmap);
 				goto out;
+			}
 			panic("pmap_enter: unable to enter mapping in PV "
 			    "table");
 		}
+		opv = NULL;
 	}
 
-	/*
-	 * Increment counters.
-	 */
+	/* Increment counters. */
 	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
 	if (wired)
 		PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
 
  validate:
-	/*
-	 * Build the new PTE.
-	 */
+	/* Build the new PTE. */
 	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap, prot) | PG_V;
 	if (pg != NULL) {
 		struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
 		int attrs;
 
-#ifdef DIAGNOSTIC
-		if ((flags & VM_PROT_ALL) & ~prot)
-			panic("pmap_enter: access type exceeds prot");
-#endif
+		KASSERT(((flags & VM_PROT_ALL) & ~prot) == 0);
+
 		lock = pmap_pvh_lock(pg);
 		mutex_enter(lock);
 		if (flags & VM_PROT_WRITE)
@@ -1717,9 +2221,7 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 		attrs = md->pvh_attrs;
 		mutex_exit(lock);
 
-		/*
-		 * Set up referenced/modified emulation for new mapping.
-		 */
+		/* Set up referenced/modified emulation for new mapping. */
 		if ((attrs & PGA_REFERENCED) == 0)
 			npte |= PG_FOR | PG_FOW | PG_FOE;
 		else if ((attrs & PGA_MODIFIED) == 0)
@@ -1738,34 +2240,35 @@ pmap_enter(pmap_t pmap, vaddr_t va, padd
 #endif
 
 	/*
-	 * If the PALcode portion of the new PTE is the same as the
+	 * If the HW / PALcode portion of the new PTE is the same as the
 	 * old PTE, no TBI is necessary.
 	 */
-	if (PG_PALCODE(opte) == PG_PALCODE(npte))
-		tflush = false;
+	if (opte & PG_V) {
+		tflush = PG_PALCODE(opte) != PG_PALCODE(npte);
+	}
 
-	/*
-	 * Set the new PTE.
-	 */
-	PMAP_SET_PTE(pte, npte);
+	/* Set the new PTE. */
+	atomic_store_relaxed(pte, npte);
+
+out:
+	PMAP_MAP_TO_HEAD_UNLOCK();
 
 	/*
 	 * Invalidate the TLB entry for this VA and any appropriate
 	 * caches.
 	 */
 	if (tflush) {
-		PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id);
-		PMAP_TLB_SHOOTDOWN(pmap, va, hadasm ? PG_ASM : 0);
-		PMAP_TLB_SHOOTNOW();
-	}
-	if (setisync)
-		PMAP_SET_NEEDISYNC(pmap);
-	if (needisync)
-		PMAP_SYNC_ISTREAM(pmap);
+		/* unlocks pmap */
+		pmap_enter_tlb_shootdown(pmap, va, opte, true);
+		if (pmap == pmap_kernel()) {
+			TLB_COUNT(reason_enter_kernel);
+		} else {
+			TLB_COUNT(reason_enter_user);
+		}
+	} else {
+		PMAP_UNLOCK(pmap);
+	}
 
-out:
-	PMAP_UNLOCK(pmap);
-	PMAP_MAP_TO_HEAD_UNLOCK();
 	if (opv)
 		pmap_pv_free(opv);
 
@@ -1783,11 +2286,7 @@ out:
 void
 pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
 {
-	pt_entry_t *pte, npte;
-	long cpu_id = cpu_number();
-	bool needisync = false;
-	pmap_t pmap = pmap_kernel();
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
+	pmap_t const pmap = pmap_kernel();
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
@@ -1795,48 +2294,41 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v
 		    va, pa, prot);
 #endif
 
-#ifdef DIAGNOSTIC
-	/*
-	 * Sanity check the virtual address.
-	 */
-	if (va < VM_MIN_KERNEL_ADDRESS)
-		panic("pmap_kenter_pa: kernel pmap, invalid va 0x%lx", va);
-#endif
+	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
 
-	pte = PMAP_KERNEL_PTE(va);
+	pt_entry_t * const pte = PMAP_KERNEL_PTE(va);
 
-	if (pmap_pte_v(pte) == 0)
-		PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
-	if (pmap_pte_w(pte) == 0)
-		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
+	/* Build the new PTE. */
+	const pt_entry_t npte =
+	    ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
+	    PG_V | PG_WIRED;
 
-	if ((prot & VM_PROT_EXECUTE) != 0 || pmap_pte_exec(pte))
-		needisync = true;
+	/* Set the new PTE. */
+	const pt_entry_t opte = atomic_load_relaxed(pte);
+	atomic_store_relaxed(pte, npte);
+	PMAP_MP(alpha_mb());
 
-	/*
-	 * Build the new PTE.
-	 */
-	npte = ((pa >> PGSHIFT) << PG_SHIFT) | pte_prot(pmap_kernel(), prot) |
-	    PG_V | PG_WIRED;
+	PMAP_STAT_INCR(pmap->pm_stats.resident_count, 1);
+	PMAP_STAT_INCR(pmap->pm_stats.wired_count, 1);
 
 	/*
-	 * Set the new PTE.
+	 * There should not have been anything here, previously,
+	 * so we can skip TLB shootdowns, etc. in the common case.
 	 */
-	PMAP_SET_PTE(pte, npte);
-#if defined(MULTIPROCESSOR)
-	alpha_mb();		/* XXX alpha_wmb()? */
-#endif
+	if (__predict_false(opte & PG_V)) {
+		const pt_entry_t diff = npte ^ opte;
 
-	/*
-	 * Invalidate the TLB entry for this VA and any appropriate
-	 * caches.
-	 */
-	PMAP_INVALIDATE_TLB(pmap, va, true, true, cpu_id);
-	PMAP_TLB_SHOOTDOWN(pmap, va, PG_ASM);
-	PMAP_TLB_SHOOTNOW();
+		printf_nolog("%s: mapping already present\n", __func__);
+		PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
+		if (diff & PG_WIRED)
+			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
+		/* XXX Can't handle this case. */
+		if (diff & PG_PVLIST)
+			panic("pmap_kenter_pa: old mapping was managed");
 
-	if (needisync)
-		PMAP_SYNC_ISTREAM_KERNEL();
+		pmap_enter_tlb_shootdown(pmap_kernel(), va, opte, false);
+		TLB_COUNT(reason_kenter);
+	}
 }
 
 /*
@@ -1848,11 +2340,9 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, v
 void
 pmap_kremove(vaddr_t va, vsize_t size)
 {
-	pt_entry_t *pte;
-	bool needisync = false;
-	long cpu_id = cpu_number();
-	pmap_t pmap = pmap_kernel();
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
+	pt_entry_t *pte, opte;
+	pmap_t const pmap = pmap_kernel();
+	struct pmap_tlb_context tlbctx;
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_ENTER))
@@ -1860,40 +2350,29 @@ pmap_kremove(vaddr_t va, vsize_t size)
 		    va, size);
 #endif
 
-#ifdef DIAGNOSTIC
-	if (va < VM_MIN_KERNEL_ADDRESS)
-		panic("pmap_kremove: user address");
-#endif
+	pmap_tlb_context_init(&tlbctx);
+
+	KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
 
 	for (; size != 0; size -= PAGE_SIZE, va += PAGE_SIZE) {
 		pte = PMAP_KERNEL_PTE(va);
-		if (pmap_pte_v(pte)) {
-#ifdef DIAGNOSTIC
-			if (pmap_pte_pv(pte))
-				panic("pmap_kremove: PG_PVLIST mapping for "
-				    "0x%lx", va);
-#endif
-			if (pmap_pte_exec(pte))
-				needisync = true;
+		opte = atomic_load_relaxed(pte);
+		if (opte & PG_V) {
+			KASSERT((opte & PG_PVLIST) == 0);
 
 			/* Zap the mapping. */
-			PMAP_SET_PTE(pte, PG_NV);
-#if defined(MULTIPROCESSOR)
-			alpha_mb();		/* XXX alpha_wmb()? */
-#endif
-			PMAP_INVALIDATE_TLB(pmap, va, true, true, cpu_id);
-			PMAP_TLB_SHOOTDOWN(pmap, va, PG_ASM);
+			atomic_store_relaxed(pte, PG_NV);
+			pmap_tlb_shootdown(pmap, va, opte, &tlbctx);
 
 			/* Update stats. */
 			PMAP_STAT_DECR(pmap->pm_stats.resident_count, 1);
 			PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
 		}
 	}
+	PMAP_MP(alpha_wmb());
 
-	PMAP_TLB_SHOOTNOW();
-
-	if (needisync)
-		PMAP_SYNC_ISTREAM_KERNEL();
+	pmap_tlb_shootnow(&tlbctx);
+	TLB_COUNT(reason_kremove);
 }
 
 /*
@@ -1916,10 +2395,9 @@ pmap_unwire(pmap_t pmap, vaddr_t va)
 	PMAP_LOCK(pmap);
 
 	pte = pmap_l3pte(pmap, va, NULL);
-#ifdef DIAGNOSTIC
-	if (pte == NULL || pmap_pte_v(pte) == 0)
-		panic("pmap_unwire");
-#endif
+
+	KASSERT(pte != NULL);
+	KASSERT(pmap_pte_v(pte));
 
 	/*
 	 * If wiring actually changed (always?) clear the wire bit and
@@ -1930,7 +2408,7 @@ pmap_unwire(pmap_t pmap, vaddr_t va)
 		pmap_pte_set_w(pte, false);
 		PMAP_STAT_DECR(pmap->pm_stats.wired_count, 1);
 	}
-#ifdef DIAGNOSTIC
+#ifdef DEBUG
 	else {
 		printf("pmap_unwire: wiring for pmap %p va 0x%lx "
 		    "didn't change!\n", pmap, va);
@@ -1961,15 +2439,19 @@ pmap_extract(pmap_t pmap, vaddr_t va, pa
 	 * Take a faster path for the kernel pmap.  Avoids locking,
 	 * handles K0SEG.
 	 */
-	if (pmap == pmap_kernel()) {
-		pa = vtophys(va);
-		if (pap != NULL)
-			*pap = pa;
+	if (__predict_true(pmap == pmap_kernel())) {
+		if (__predict_true(vtophys_internal(va, pap))) {
+#ifdef DEBUG
+			if (pmapdebug & PDB_FOLLOW)
+				printf("0x%lx (kernel vtophys)\n", pa);
+#endif
+			return true;
+		}
 #ifdef DEBUG
 		if (pmapdebug & PDB_FOLLOW)
-			printf("0x%lx (kernel vtophys)\n", pa);
+			printf("failed (kernel vtophys)\n");
 #endif
-		return (pa != 0);	/* XXX */
+		return false;
 	}
 
 	PMAP_LOCK(pmap);
@@ -2049,19 +2531,23 @@ pmap_activate(struct lwp *l)
 
 	KASSERT(l == ci->ci_curlwp);
 
-	/* Mark the pmap in use by this processor. */
-	atomic_or_ulong(&pmap->pm_cpus, (1UL << ci->ci_cpuid));
-
-	/* Allocate an ASN. */
-	pmap_asn_alloc(pmap, ci->ci_cpuid);
-	PMAP_ACTIVATE_ASN_SANITY(pmap, ci->ci_cpuid);
-
 	u_long const old_ptbr = pcb->pcb_hw.apcb_ptbr;
 	u_int const old_asn = pcb->pcb_hw.apcb_asn;
 
+	/*
+	 * We hold the activation lock to synchronize with TLB shootdown.
+	 * The kernel pmap does not require those tests because shootdowns
+	 * for the kernel pmap are always sent to all CPUs.
+	 */
+	if (pmap != pmap_kernel()) {
+		PMAP_ACT_LOCK(pmap);
+		pcb->pcb_hw.apcb_asn = pmap_asn_alloc(pmap, ci);
+		atomic_or_ulong(&pmap->pm_cpus, (1UL << ci->ci_cpuid));
+	} else {
+		pcb->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
+	}
 	pcb->pcb_hw.apcb_ptbr =
 	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap->pm_lev1map) >> PGSHIFT;
-	pcb->pcb_hw.apcb_asn = (pmap)->pm_asni[ci->ci_cpuid].pma_asn;
 
 	/*
 	 * Check to see if the ASN or page table base has changed; if
@@ -2072,10 +2558,26 @@ pmap_activate(struct lwp *l)
 	 */
 	if (old_asn != pcb->pcb_hw.apcb_asn ||
 	    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
+		if (old_asn != pcb->pcb_hw.apcb_asn &&
+		    old_ptbr != pcb->pcb_hw.apcb_ptbr) {
+			TLB_COUNT(activate_both_change);
+		} else if (old_asn != pcb->pcb_hw.apcb_asn) {
+			TLB_COUNT(activate_asn_change);
+		} else {
+			TLB_COUNT(activate_ptbr_change);
+		}
 		(void) alpha_pal_swpctx((u_long)l->l_md.md_pcbpaddr);
+		TLB_COUNT(activate_swpctx);
+	} else {
+		TLB_COUNT(activate_skip_swpctx);
 	}
 
+	pmap_reference(pmap);
 	ci->ci_pmap = pmap;
+
+	if (pmap != pmap_kernel()) {
+		PMAP_ACT_UNLOCK(pmap);
+	}
 }
 
 /*
@@ -2099,8 +2601,7 @@ pmap_deactivate(struct lwp *l)
 	struct cpu_info * const ci = curcpu();
 
 	KASSERT(l == ci->ci_curlwp);
-
-	atomic_and_ulong(&pmap->pm_cpus, ~(1UL << ci->ci_cpuid));
+	KASSERT(pmap == ci->ci_pmap);
 
 	/*
 	 * There is no need to switch to a different PTBR here,
@@ -2113,6 +2614,9 @@ pmap_deactivate(struct lwp *l)
 	 * the kernel pmap.
 	 */
 	ci->ci_pmap = pmap_kernel();
+	PMAP_MP(alpha_mb());
+	KASSERT(atomic_load_relaxed(&pmap->pm_count) > 1);
+	pmap_destroy(pmap);
 }
 
 /*
@@ -2232,27 +2736,32 @@ pmap_clear_modify(struct vm_page *pg)
 {
 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
 	bool rv = false;
-	long cpu_id = cpu_number();
 	kmutex_t *lock;
+	struct pmap_tlb_context tlbctx;
 
 #ifdef DEBUG
 	if (pmapdebug & PDB_FOLLOW)
 		printf("pmap_clear_modify(%p)\n", pg);
 #endif
 
+	pmap_tlb_context_init(&tlbctx);
+
 	PMAP_HEAD_TO_MAP_LOCK();
 	lock = pmap_pvh_lock(pg);
 	mutex_enter(lock);
 
 	if (md->pvh_attrs & PGA_MODIFIED) {
 		rv = true;
-		pmap_changebit(pg, PG_FOW, ~0, cpu_id);
+		pmap_changebit(pg, PG_FOW, ~0UL, &tlbctx);
 		md->pvh_attrs &= ~PGA_MODIFIED;
 	}
 
 	mutex_exit(lock);
 	PMAP_HEAD_TO_MAP_UNLOCK();
 
+	pmap_tlb_shootnow(&tlbctx);
+	TLB_COUNT(reason_clear_modify);
+
 	return (rv);
 }
 
@@ -2266,27 +2775,32 @@ pmap_clear_reference(struct vm_page *pg)
 {
 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
 	bool rv = false;
-	long cpu_id = cpu_number();
 	kmutex_t *lock;
+	struct pmap_tlb_context tlbctx;
 
 #ifdef DEBUG
 	if (pmapdebug & PDB_FOLLOW)
 		printf("pmap_clear_reference(%p)\n", pg);
 #endif
 
+	pmap_tlb_context_init(&tlbctx);
+
 	PMAP_HEAD_TO_MAP_LOCK();
 	lock = pmap_pvh_lock(pg);
 	mutex_enter(lock);
 
 	if (md->pvh_attrs & PGA_REFERENCED) {
 		rv = true;
-		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0, cpu_id);
+		pmap_changebit(pg, PG_FOR | PG_FOW | PG_FOE, ~0UL, &tlbctx);
 		md->pvh_attrs &= ~PGA_REFERENCED;
 	}
 
 	mutex_exit(lock);
 	PMAP_HEAD_TO_MAP_UNLOCK();
 
+	pmap_tlb_shootnow(&tlbctx);
+	TLB_COUNT(reason_clear_reference);
+
 	return (rv);
 }
 
@@ -2377,25 +2891,21 @@ alpha_protection_init(void)
  *	from beneath it.  We assume that the pmap itself is already
  *	locked; dolock applies only to the PV list.
  *
- *	Returns true or false, indicating if an I-stream sync needs
- *	to be initiated (for this CPU or for other CPUs).
+ *	Returns important PTE bits that the caller needs to check for
+ *	TLB / I-stream invalidation purposes.
  */
-static bool
+static pt_entry_t
 pmap_remove_mapping(pmap_t pmap, vaddr_t va, pt_entry_t *pte,
-    bool dolock, long cpu_id, pv_entry_t *opvp)
+    bool dolock, pv_entry_t *opvp, struct pmap_tlb_context * const tlbctx)
 {
+	pt_entry_t opte;
 	paddr_t pa;
 	struct vm_page *pg;		/* if != NULL, page is managed */
-	bool onpv;
-	bool hadasm;
-	bool isactive;
-	bool needisync = false;
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_REMOVE|PDB_PROTECT))
-		printf("pmap_remove_mapping(%p, %lx, %p, %d, %ld, %p)\n",
-		       pmap, va, pte, dolock, cpu_id, opvp);
+		printf("pmap_remove_mapping(%p, %lx, %p, %d, %p)\n",
+		       pmap, va, pte, dolock, opvp);
 #endif
 
 	/*
@@ -2404,28 +2914,12 @@ pmap_remove_mapping(pmap_t pmap, vaddr_t
 	if (pte == NULL) {
 		pte = pmap_l3pte(pmap, va, NULL);
 		if (pmap_pte_v(pte) == 0)
-			return (false);
+			return 0;
 	}
 
-	pa = pmap_pte_pa(pte);
-	onpv = (pmap_pte_pv(pte) != 0);
-	hadasm = (pmap_pte_asm(pte) != 0);
-	isactive = PMAP_ISACTIVE(pmap, cpu_id);
-
-	/*
-	 * Determine what we need to do about the I-stream.  If
-	 * PG_EXEC was set, we mark a user pmap as needing an
-	 * I-sync on the way out to userspace.  We always need
-	 * an immediate I-sync for the kernel pmap.
-	 */
-	if (pmap_pte_exec(pte)) {
-		if (pmap == pmap_kernel())
-			needisync = true;
-		else {
-			PMAP_SET_NEEDISYNC(pmap);
-			needisync = (pmap->pm_cpus != 0);
-		}
-	}
+	opte = *pte;
+
+	pa = PG_PFNUM(opte) << PGSHIFT;
 
 	/*
 	 * Update statistics
@@ -2441,11 +2935,7 @@ pmap_remove_mapping(pmap_t pmap, vaddr_t
 	if (pmapdebug & PDB_REMOVE)
 		printf("remove: invalidating pte at %p\n", pte);
 #endif
-	PMAP_SET_PTE(pte, PG_NV);
-
-	PMAP_INVALIDATE_TLB(pmap, va, hadasm, isactive, cpu_id);
-	PMAP_TLB_SHOOTDOWN(pmap, va, hadasm ? PG_ASM : 0);
-	PMAP_TLB_SHOOTNOW();
+	atomic_store_relaxed(pte, PG_NV);
 
 	/*
 	 * If we're removing a user mapping, check to see if we
@@ -2457,24 +2947,20 @@ pmap_remove_mapping(pmap_t pmap, vaddr_t
 		 * delete references on the level 2 and 1 tables as
 		 * appropriate.
 		 */
-		pmap_l3pt_delref(pmap, va, pte, cpu_id);
+		pmap_l3pt_delref(pmap, va, pte, tlbctx);
 	}
 
-	/*
-	 * If the mapping wasn't entered on the PV list, we're all done.
-	 */
-	if (onpv == false)
-		return (needisync);
-
-	/*
-	 * Remove it from the PV table.
-	 */
-	pg = PHYS_TO_VM_PAGE(pa);
-	KASSERT(pg != NULL);
-	pmap_pv_remove(pmap, pg, va, dolock, opvp);
-	KASSERT(opvp == NULL || *opvp != NULL);
+	if (opte & PG_PVLIST) {
+		/*
+		 * Remove it from the PV table.
+		 */
+		pg = PHYS_TO_VM_PAGE(pa);
+		KASSERT(pg != NULL);
+		pmap_pv_remove(pmap, pg, va, dolock, opvp);
+		KASSERT(opvp == NULL || *opvp != NULL);
+	}
 
-	return (needisync);
+	return opte & (PG_V | PG_ASM | PG_EXEC);
 }
 
 /*
@@ -2488,14 +2974,12 @@ pmap_remove_mapping(pmap_t pmap, vaddr_t
  *	the pmaps as we encounter them.
  */
 static void
-pmap_changebit(struct vm_page *pg, u_long set, u_long mask, long cpu_id)
+pmap_changebit(struct vm_page *pg, pt_entry_t set, pt_entry_t mask,
+    struct pmap_tlb_context * const tlbctx)
 {
 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
 	pv_entry_t pv;
-	pt_entry_t *pte, npte;
-	vaddr_t va;
-	bool hadasm, isactive;
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
+	pt_entry_t *pte, npte, opte;
 
 #ifdef DEBUG
 	if (pmapdebug & PDB_BITS)
@@ -2507,25 +2991,18 @@ pmap_changebit(struct vm_page *pg, u_lon
 	 * Loop over all current mappings setting/clearing as apropos.
 	 */
 	for (pv = md->pvh_list; pv != NULL; pv = pv->pv_next) {
-		va = pv->pv_va;
-
 		PMAP_LOCK(pv->pv_pmap);
 
 		pte = pv->pv_pte;
-		npte = (*pte | set) & mask;
-		if (*pte != npte) {
-			hadasm = (pmap_pte_asm(pte) != 0);
-			isactive = PMAP_ISACTIVE(pv->pv_pmap, cpu_id);
-			PMAP_SET_PTE(pte, npte);
-			PMAP_INVALIDATE_TLB(pv->pv_pmap, va, hadasm, isactive,
-			    cpu_id);
-			PMAP_TLB_SHOOTDOWN(pv->pv_pmap, va,
-			    hadasm ? PG_ASM : 0);
+
+		opte = atomic_load_relaxed(pte);
+		npte = (opte | set) & mask;
+		if (npte != opte) {
+			atomic_store_relaxed(pte, npte);
+			pmap_tlb_shootdown_pv(pv, opte, tlbctx);
 		}
 		PMAP_UNLOCK(pv->pv_pmap);
 	}
-
-	PMAP_TLB_SHOOTNOW();
 }
 
 /*
@@ -2544,7 +3021,6 @@ pmap_emulate_reference(struct lwp *l, va
 	paddr_t pa;
 	bool didlock = false;
 	bool exec = false;
-	long cpu_id = cpu_number();
 	kmutex_t *lock;
 
 #ifdef DEBUG
@@ -2589,22 +3065,7 @@ pmap_emulate_reference(struct lwp *l, va
 		printf("*pte = 0x%lx\n", *pte);
 	}
 #endif
-#if 0/*DEBUG*/	/* These checks are, expensive, racy, and unreliable. */
-	if (!pmap_pte_v(pte))
-		panic("pmap_emulate_reference: invalid pte");
-	if (type == ALPHA_MMCSR_FOW) {
-		if (!(*pte & (user ? PG_UWE : PG_UWE | PG_KWE)))
-			panic("pmap_emulate_reference: write but unwritable");
-		if (!(*pte & PG_FOW))
-			panic("pmap_emulate_reference: write but not FOW");
-	} else {
-		if (!(*pte & (user ? PG_URE : PG_URE | PG_KRE)))
-			panic("pmap_emulate_reference: !write but unreadable");
-		if (!(*pte & (PG_FOR | PG_FOE)))
-			panic("pmap_emulate_reference: !write but not FOR|FOE");
-	}
-	/* Other diagnostics? */
-#endif
+
 	pa = pmap_pte_pa(pte);
 
 	/*
@@ -2634,6 +3095,9 @@ pmap_emulate_reference(struct lwp *l, va
 	 */
 	pg = PHYS_TO_VM_PAGE(pa);
 	struct vm_page_md * const md = VM_PAGE_TO_MD(pg);
+	struct pmap_tlb_context tlbctx;
+
+	pmap_tlb_context_init(&tlbctx);
 
 	PMAP_HEAD_TO_MAP_LOCK();
 	lock = pmap_pvh_lock(pg);
@@ -2649,10 +3113,14 @@ pmap_emulate_reference(struct lwp *l, va
 			faultoff |= PG_FOE;
 		}
 	}
-	pmap_changebit(pg, 0, ~faultoff, cpu_id);
+	pmap_changebit(pg, 0, ~faultoff, &tlbctx);
 
 	mutex_exit(lock);
 	PMAP_HEAD_TO_MAP_UNLOCK();
+
+	pmap_tlb_shootnow(&tlbctx);
+	TLB_COUNT(reason_emulate_reference);
+
 	return (0);
 }
 
@@ -2694,27 +3162,38 @@ pmap_pv_dump(paddr_t pa)
  *
  *	Note: no locking is necessary in this function.
  */
-paddr_t
-vtophys(vaddr_t vaddr)
+static bool
+vtophys_internal(vaddr_t const vaddr, paddr_t * const pap)
 {
-	pt_entry_t *pte;
-	paddr_t paddr = 0;
+	paddr_t pa;
 
-	if (vaddr < ALPHA_K0SEG_BASE)
-		printf("vtophys: invalid vaddr 0x%lx", vaddr);
-	else if (vaddr <= ALPHA_K0SEG_END)
-		paddr = ALPHA_K0SEG_TO_PHYS(vaddr);
-	else {
-		pte = PMAP_KERNEL_PTE(vaddr);
-		if (pmap_pte_v(pte))
-			paddr = pmap_pte_pa(pte) | (vaddr & PGOFSET);
+	KASSERT(vaddr >= ALPHA_K0SEG_BASE);
+
+	if (vaddr <= ALPHA_K0SEG_END) {
+		pa = ALPHA_K0SEG_TO_PHYS(vaddr);
+	} else {
+		pt_entry_t * const pte = PMAP_KERNEL_PTE(vaddr);
+		if (__predict_false(! pmap_pte_v(pte))) {
+			return false;
+		}
+		pa = pmap_pte_pa(pte) | (vaddr & PGOFSET);
 	}
 
-#if 0
-	printf("vtophys(0x%lx) -> 0x%lx\n", vaddr, paddr);
-#endif
+	if (pap != NULL) {
+		*pap = pa;
+	}
 
-	return (paddr);
+	return true;
+}
+
+paddr_t
+vtophys(vaddr_t const vaddr)
+{
+	paddr_t pa;
+
+	if (__predict_false(! vtophys_internal(vaddr, &pa)))
+		pa = 0;
+	return pa;
 }
 
 /******************** pv_entry management ********************/
@@ -3054,59 +3533,6 @@ pmap_growkernel(vaddr_t maxkvaddr)
 }
 
 /*
- * pmap_lev1map_create:
- *
- *	Create a new level 1 page table for the specified pmap.
- *
- *	Note: growkernel must already be held and the pmap either
- *	already locked or unreferenced globally.
- */
-static int
-pmap_lev1map_create(pmap_t pmap, long cpu_id)
-{
-	pt_entry_t *l1pt;
-
-	KASSERT(pmap != pmap_kernel());
-
-	KASSERT(pmap->pm_lev1map == kernel_lev1map);
-	KASSERT(pmap->pm_asni[cpu_id].pma_asn == PMAP_ASN_RESERVED);
-
-	/* Don't sleep -- we're called with locks held. */
-	l1pt = pool_cache_get(&pmap_l1pt_cache, PR_NOWAIT);
-	if (l1pt == NULL)
-		return (ENOMEM);
-
-	pmap->pm_lev1map = l1pt;
-	return (0);
-}
-
-/*
- * pmap_lev1map_destroy:
- *
- *	Destroy the level 1 page table for the specified pmap.
- *
- *	Note: growkernel must be held and the pmap must already be
- *	locked or not globally referenced.
- */
-static void
-pmap_lev1map_destroy(pmap_t pmap, long cpu_id)
-{
-	pt_entry_t *l1pt = pmap->pm_lev1map;
-
-	KASSERT(pmap != pmap_kernel());
-
-	/*
-	 * Go back to referencing the global kernel_lev1map.
-	 */
-	pmap->pm_lev1map = kernel_lev1map;
-
-	/*
-	 * Free the old level 1 page table page.
-	 */
-	pool_cache_put(&pmap_l1pt_cache, l1pt);
-}
-
-/*
  * pmap_l1pt_ctor:
  *
  *	Pool cache constructor for L1 PT pages.
@@ -3177,13 +3603,13 @@ pmap_l1pt_free(struct pool *pp, void *v)
 /*
  * pmap_ptpage_alloc:
  *
- *	Allocate a level 2 or level 3 page table page, and
- *	initialize the PTE that references it.
+ *	Allocate a level 2 or level 3 page table page for a user
+ *	pmap, and initialize the PTE that references it.
  *
  *	Note: the pmap must already be locked.
  */
 static int
-pmap_ptpage_alloc(pmap_t pmap, pt_entry_t *pte, int usage)
+pmap_ptpage_alloc(pt_entry_t * const pte, int const usage)
 {
 	paddr_t ptpa;
 
@@ -3196,9 +3622,10 @@ pmap_ptpage_alloc(pmap_t pmap, pt_entry_
 	/*
 	 * Initialize the referencing PTE.
 	 */
-	PMAP_SET_PTE(pte, ((ptpa >> PGSHIFT) << PG_SHIFT) |
-	    PG_V | PG_KRE | PG_KWE | PG_WIRED |
-	    (pmap == pmap_kernel() ? PG_ASM : 0));
+	const pt_entry_t npte = ((ptpa >> PGSHIFT) << PG_SHIFT) |
+	    PG_V | PG_KRE | PG_KWE | PG_WIRED;
+
+	atomic_store_relaxed(pte, npte);
 
 	return (0);
 }
@@ -3212,21 +3639,20 @@ pmap_ptpage_alloc(pmap_t pmap, pt_entry_
  *	Note: the pmap must already be locked.
  */
 static void
-pmap_ptpage_free(pmap_t pmap, pt_entry_t *pte)
+pmap_ptpage_free(pt_entry_t * const pte, struct pmap_tlb_context * const tlbctx)
 {
-	paddr_t ptpa;
 
 	/*
 	 * Extract the physical address of the page from the PTE
 	 * and clear the entry.
 	 */
-	ptpa = pmap_pte_pa(pte);
-	PMAP_SET_PTE(pte, PG_NV);
+	const paddr_t ptpa = pmap_pte_pa(pte);
+	atomic_store_relaxed(pte, PG_NV);
 
 #ifdef DEBUG
 	pmap_zero_page(ptpa);
 #endif
-	pmap_physpage_free(ptpa);
+	pmap_tlb_physpage_free(ptpa, tlbctx);
 }
 
 /*
@@ -3238,10 +3664,10 @@ pmap_ptpage_free(pmap_t pmap, pt_entry_t
  *	Note: the pmap must already be locked.
  */
 static void
-pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte, long cpu_id)
+pmap_l3pt_delref(pmap_t pmap, vaddr_t va, pt_entry_t *l3pte,
+    struct pmap_tlb_context * const tlbctx)
 {
 	pt_entry_t *l1pte, *l2pte;
-	PMAP_TLB_SHOOTDOWN_CPUSET_DECL
 
 	l1pte = pmap_l1pte(pmap, va);
 	l2pte = pmap_l2pte(pmap, va, l1pte);
@@ -3260,28 +3686,26 @@ pmap_l3pt_delref(pmap_t pmap, vaddr_t va
 			printf("pmap_l3pt_delref: freeing level 3 table at "
 			    "0x%lx\n", pmap_pte_pa(l2pte));
 #endif
-		pmap_ptpage_free(pmap, l2pte);
+		/*
+		 * You can pass NULL if you know the last reference won't
+		 * be dropped.
+		 */
+		KASSERT(tlbctx != NULL);
+		pmap_ptpage_free(l2pte, tlbctx);
 
 		/*
-		 * We've freed a level 3 table, so we must
-		 * invalidate the TLB entry for that PT page
-		 * in the Virtual Page Table VA range, because
-		 * otherwise the PALcode will service a TLB
-		 * miss using the stale VPT TLB entry it entered
-		 * behind our back to shortcut to the VA's PTE.
-		 */
-		PMAP_INVALIDATE_TLB(pmap,
-		    (vaddr_t)(&VPT[VPT_INDEX(va)]), false,
-		    PMAP_ISACTIVE(pmap, cpu_id), cpu_id);
-		PMAP_TLB_SHOOTDOWN(pmap,
-		    (vaddr_t)(&VPT[VPT_INDEX(va)]), 0);
-		PMAP_TLB_SHOOTNOW();
+		 * We've freed a level 3 table, so we must invalidate
+		 * any now-stale TLB entries for the corresponding VPT
+		 * VA range.  Easiest way to guarantee this is to hit
+		 * all of the user TLB entries.
+		 */
+		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
 
 		/*
 		 * We've freed a level 3 table, so delete the reference
 		 * on the level 2 table.
 		 */
-		pmap_l2pt_delref(pmap, l1pte, l2pte, cpu_id);
+		pmap_l2pt_delref(pmap, l1pte, l2pte, tlbctx);
 	}
 }
 
@@ -3295,7 +3719,7 @@ pmap_l3pt_delref(pmap_t pmap, vaddr_t va
  */
 static void
 pmap_l2pt_delref(pmap_t pmap, pt_entry_t *l1pte, pt_entry_t *l2pte,
-    long cpu_id)
+    struct pmap_tlb_context * const tlbctx)
 {
 
 #ifdef DIAGNOSTIC
@@ -3313,32 +3737,39 @@ pmap_l2pt_delref(pmap_t pmap, pt_entry_t
 			printf("pmap_l2pt_delref: freeing level 2 table at "
 			    "0x%lx\n", pmap_pte_pa(l1pte));
 #endif
-		pmap_ptpage_free(pmap, l1pte);
+		/*
+		 * You can pass NULL if you know the last reference won't
+		 * be dropped.
+		 */
+		KASSERT(tlbctx != NULL);
+		pmap_ptpage_free(l1pte, tlbctx);
+
+		/*
+		 * We've freed a level 2 table, so we must invalidate
+		 * any now-stale TLB entries for the corresponding VPT
+		 * VA range.  Easiest way to guarantee this is to hit
+		 * all of the user TLB entries.
+		 */
+		pmap_tlb_shootdown_all_user(pmap, PG_V, tlbctx);
 
 		/*
 		 * We've freed a level 2 table, so delete the reference
 		 * on the level 1 table.
 		 */
-		pmap_l1pt_delref(pmap, l1pte, cpu_id);
+		pmap_l1pt_delref(pmap, l1pte);
 	}
 }
 
 /*
  * pmap_l1pt_delref:
  *
- *	Delete a reference on a level 1 PT page.  If the reference drops
- *	to zero, free it.
- *
- *	Note: the pmap must already be locked.
+ *	Delete a reference on a level 1 PT page.
  */
 static void
-pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte, long cpu_id)
+pmap_l1pt_delref(pmap_t pmap, pt_entry_t *l1pte)
 {
 
-#ifdef DIAGNOSTIC
-	if (pmap == pmap_kernel())
-		panic("pmap_l1pt_delref: kernel pmap");
-#endif
+	KASSERT(pmap != pmap_kernel());
 
 	(void)pmap_physpage_delref(l1pte);
 }
@@ -3354,66 +3785,48 @@ pmap_l1pt_delref(pmap_t pmap, pt_entry_t
  *	an interprocessor interrupt, and in that case, the sender of
  *	the IPI has the pmap lock.
  */
-static void
-pmap_asn_alloc(pmap_t pmap, long cpu_id)
+static u_int
+pmap_asn_alloc(pmap_t const pmap, struct cpu_info * const ci)
 {
-	struct pmap_asn_info *pma = &pmap->pm_asni[cpu_id];
-	struct pmap_asn_info *cpma = &pmap_asn_info[cpu_id];
 
 #ifdef DEBUG
 	if (pmapdebug & (PDB_FOLLOW|PDB_ASN))
 		printf("pmap_asn_alloc(%p)\n", pmap);
 #endif
 
-	/* No work to do for kernel pmap; all kernel mappings have ASM. */
-	if (pmap == pmap_kernel())
-		return;
-
-	/* No user pmaps should reference the kernel_lev1map. */
+	KASSERT(pmap != pmap_kernel());
 	KASSERT(pmap->pm_lev1map != kernel_lev1map);
-
 	KASSERT(kpreempt_disabled());
 
-	/*
-	 * On processors which do not implement ASNs, the swpctx PALcode
-	 * operation will automatically invalidate the TLB and I-cache,
-	 * so we don't need to do that here.
-	 */
-	if (pmap_max_asn == 0) {
-		/*
-		 * Refresh the pmap's generation number, to
-		 * simplify logic elsewhere.
-		 */
-		pma->pma_asngen = cpma->pma_asngen;
-#ifdef DEBUG
-		if (pmapdebug & PDB_ASN)
-			printf("pmap_asn_alloc: no ASNs, using asngen %lu\n",
-			    pma->pma_asngen);
-#endif
-		return;
-	}
+	/* No work to do if the CPU does not implement ASNs. */
+	if (pmap_max_asn == 0)
+		return 0;
+
+	struct pmap_asn_info * const pma = &pmap->pm_asni[ci->ci_cpuid];
 
 	/*
 	 * Hopefully, we can continue using the one we have...
+	 *
+	 * N.B. the generation check will fail the first time
+	 * any pmap is activated on a given CPU, because we start
+	 * the generation counter at 1, but initialize pmaps with
+	 * 0; this forces the first ASN allocation to occur.
 	 */
-	if (pma->pma_asn != PMAP_ASN_RESERVED &&
-	    pma->pma_asngen == cpma->pma_asngen) {
-		/*
-		 * ASN is still in the current generation; keep on using it.
-		 */
+	if (pma->pma_asngen == ci->ci_asn_gen) {
 #ifdef DEBUG
 		if (pmapdebug & PDB_ASN)
 			printf("pmap_asn_alloc: same generation, keeping %u\n",
 			    pma->pma_asn);
 #endif
-		return;
+		TLB_COUNT(asn_reuse);
+		return pma->pma_asn;
 	}
 
 	/*
 	 * Need to assign a new ASN.  Grab the next one, incrementing
 	 * the generation number if we have to.
 	 */
-	if (cpma->pma_asn > pmap_max_asn) {
+	if (ci->ci_next_asn > pmap_max_asn) {
 		/*
 		 * Invalidate all non-PG_ASM TLB entries and the
 		 * I-cache, and bump the generation number.
@@ -3421,8 +3834,9 @@ pmap_asn_alloc(pmap_t pmap, long cpu_id)
 		ALPHA_TBIAP();
 		alpha_pal_imb();
 
-		cpma->pma_asn = 1;
-		cpma->pma_asngen++;
+		ci->ci_next_asn = PMAP_ASN_FIRST_USER;
+		ci->ci_asn_gen++;
+		TLB_COUNT(asn_newgen);
 
 		/*
 		 * Make sure the generation number doesn't wrap.  We could
@@ -3437,182 +3851,31 @@ pmap_asn_alloc(pmap_t pmap, long cpu_id)
 		 *
 		 * So, we don't bother.
 		 */
-		KASSERT(cpma->pma_asngen != 0);
+		KASSERT(ci->ci_asn_gen != PMAP_ASNGEN_INVALID);
 #ifdef DEBUG
 		if (pmapdebug & PDB_ASN)
 			printf("pmap_asn_alloc: generation bumped to %lu\n",
-			    cpma->pma_asngen);
+			    ci->ci_asn_gen);
 #endif
 	}
 
 	/*
 	 * Assign the new ASN and validate the generation number.
 	 */
-	pma->pma_asn = cpma->pma_asn++;
-	pma->pma_asngen = cpma->pma_asngen;
+	pma->pma_asn = ci->ci_next_asn++;
+	pma->pma_asngen = ci->ci_asn_gen;
+	TLB_COUNT(asn_assign);
+
+	/*
+	 * We have a new ASN, so we can skip any pending I-stream sync
+	 * on the way back out to user space.
+	 */
+	atomic_and_ulong(&pmap->pm_needisync, ~(1UL << ci->ci_cpuid));
 
 #ifdef DEBUG
 	if (pmapdebug & PDB_ASN)
 		printf("pmap_asn_alloc: assigning %u to pmap %p\n",
 		    pma->pma_asn, pmap);
 #endif
-
-	/*
-	 * Have a new ASN, so there's no need to sync the I-stream
-	 * on the way back out to userspace.
-	 */
-	atomic_and_ulong(&pmap->pm_needisync, ~(1UL << cpu_id));
-}
-
-#if defined(MULTIPROCESSOR)
-/******************** TLB shootdown code ********************/
-
-/*
- * pmap_tlb_shootdown:
- *
- *	Cause the TLB entry for pmap/va to be shot down.
- *
- *	NOTE: The pmap must be locked here.
- */
-void
-pmap_tlb_shootdown(pmap_t pmap, vaddr_t va, pt_entry_t pte, u_long *cpumaskp)
-{
-	struct pmap_tlb_shootdown_q *pq;
-	struct pmap_tlb_shootdown_job *pj;
-	struct cpu_info *ci, *self = curcpu();
-	u_long cpumask;
-	CPU_INFO_ITERATOR cii;
-
-	KASSERT((pmap == pmap_kernel()) || mutex_owned(&pmap->pm_lock));
-
-	cpumask = 0;
-
-	for (CPU_INFO_FOREACH(cii, ci)) {
-		if (ci == self)
-			continue;
-
-		/*
-		 * The pmap must be locked (unless its the kernel
-		 * pmap, in which case it is okay for it to be
-		 * unlocked), which prevents it from  becoming
-		 * active on any additional processors.  This makes
-		 * it safe to check for activeness.  If it's not
-		 * active on the processor in question, then just
-		 * mark it as needing a new ASN the next time it
-		 * does, saving the IPI.  We always have to send
-		 * the IPI for the kernel pmap.
-		 *
-		 * Note if it's marked active now, and it becomes
-		 * inactive by the time the processor receives
-		 * the IPI, that's okay, because it does the right
-		 * thing with it later.
-		 */
-		if (pmap != pmap_kernel() &&
-		    PMAP_ISACTIVE(pmap, ci->ci_cpuid) == 0) {
-			PMAP_INVALIDATE_ASN(pmap, ci->ci_cpuid);
-			continue;
-		}
-
-		cpumask |= 1UL << ci->ci_cpuid;
-
-		pq = &pmap_tlb_shootdown_q[ci->ci_cpuid];
-		mutex_spin_enter(&pq->pq_lock);
-
-		/*
-		 * Allocate a job.
-		 */
-		if (pq->pq_count < PMAP_TLB_SHOOTDOWN_MAXJOBS) {
-			pj = pool_cache_get(&pmap_tlb_shootdown_job_cache,
-			    PR_NOWAIT);
-		} else {
-			pj = NULL;
-		}
-
-		/*
-		 * If a global flush is already pending, we
-		 * don't really have to do anything else.
-		 */
-		pq->pq_pte |= pte;
-		if (pq->pq_tbia) {
-			mutex_spin_exit(&pq->pq_lock);
-			if (pj != NULL) {
-				pool_cache_put(&pmap_tlb_shootdown_job_cache,
-				    pj);
-			}
-			continue;
-		}
-		if (pj == NULL) {
-			/*
-			 * Couldn't allocate a job entry.  Just
-			 * tell the processor to kill everything.
-			 */
-			pq->pq_tbia = 1;
-		} else {
-			pj->pj_pmap = pmap;
-			pj->pj_va = va;
-			pj->pj_pte = pte;
-			pq->pq_count++;
-			TAILQ_INSERT_TAIL(&pq->pq_head, pj, pj_list);
-		}
-		mutex_spin_exit(&pq->pq_lock);
-	}
-
-	*cpumaskp |= cpumask;
-}
-
-/*
- * pmap_tlb_shootnow:
- *
- *	Process the TLB shootdowns that we have been accumulating
- *	for the specified processor set.
- */
-void
-pmap_tlb_shootnow(u_long cpumask)
-{
-
-	alpha_multicast_ipi(cpumask, ALPHA_IPI_SHOOTDOWN);
-}
-
-/*
- * pmap_do_tlb_shootdown:
- *
- *	Process pending TLB shootdown operations for this processor.
- */
-void
-pmap_do_tlb_shootdown(struct cpu_info *ci, struct trapframe *framep)
-{
-	u_long cpu_id = ci->ci_cpuid;
-	u_long cpu_mask = (1UL << cpu_id);
-	struct pmap_tlb_shootdown_q *pq = &pmap_tlb_shootdown_q[cpu_id];
-	struct pmap_tlb_shootdown_job *pj, *next;
-	TAILQ_HEAD(, pmap_tlb_shootdown_job) jobs;
-
-	TAILQ_INIT(&jobs);
-
-	mutex_spin_enter(&pq->pq_lock);
-	TAILQ_CONCAT(&jobs, &pq->pq_head, pj_list);
-	if (pq->pq_tbia) {
-		if (pq->pq_pte & PG_ASM)
-			ALPHA_TBIA();
-		else
-			ALPHA_TBIAP();
-		pq->pq_tbia = 0;
-		pq->pq_pte = 0;
-	} else {
-		TAILQ_FOREACH(pj, &jobs, pj_list) {
-			PMAP_INVALIDATE_TLB(pj->pj_pmap, pj->pj_va,
-			    pj->pj_pte & PG_ASM,
-			    pj->pj_pmap->pm_cpus & cpu_mask, cpu_id);
-		}
-		pq->pq_pte = 0;
-	}
-	pq->pq_count = 0;
-	mutex_spin_exit(&pq->pq_lock);
-
-	/* Free jobs back to the cache. */
-	for (pj = TAILQ_FIRST(&jobs); pj != NULL; pj = next) {
-		next = TAILQ_NEXT(pj, pj_list);
-		pool_cache_put(&pmap_tlb_shootdown_job_cache, pj);
-	}
+	return pma->pma_asn;
 }
-#endif /* MULTIPROCESSOR */

Index: src/sys/arch/alpha/alpha/vm_machdep.c
diff -u src/sys/arch/alpha/alpha/vm_machdep.c:1.115 src/sys/arch/alpha/alpha/vm_machdep.c:1.116
--- src/sys/arch/alpha/alpha/vm_machdep.c:1.115	Sun Aug 16 18:05:52 2020
+++ src/sys/arch/alpha/alpha/vm_machdep.c	Sat Aug 29 20:06:59 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: vm_machdep.c,v 1.115 2020/08/16 18:05:52 thorpej Exp $ */
+/* $NetBSD: vm_machdep.c,v 1.116 2020/08/29 20:06:59 thorpej Exp $ */
 
 /*
  * Copyright (c) 1994, 1995, 1996 Carnegie-Mellon University.
@@ -29,7 +29,7 @@
 
 #include <sys/cdefs.h>			/* RCS ID & Copyright macro defns */
 
-__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.115 2020/08/16 18:05:52 thorpej Exp $");
+__KERNEL_RCSID(0, "$NetBSD: vm_machdep.c,v 1.116 2020/08/29 20:06:59 thorpej Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -109,14 +109,12 @@ cpu_lwp_fork(struct lwp *l1, struct lwp 
 		pcb2->pcb_hw.apcb_usp = alpha_pal_rdusp();
 
 	/*
-	 * Put l2's lev1map into its PTBR so that it will be on its
-	 * own page tables as the SWPCTX to its PCB is made.  ASN
-	 * doesn't matter at this point; that will be handled on l2's
-	 * first pmap_activate() call.
+	 * Put l2 on the kernel's page tables until its first trip
+	 * through pmap_activate().
 	 */
-	pmap_t const pmap2 = l2->l_proc->p_vmspace->vm_map.pmap;
 	pcb2->pcb_hw.apcb_ptbr =
-	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap2->pm_lev1map) >> PGSHIFT;
+	    ALPHA_K0SEG_TO_PHYS((vaddr_t)pmap_kernel()->pm_lev1map) >> PGSHIFT;
+	pcb2->pcb_hw.apcb_asn = PMAP_ASN_KERNEL;
 
 #ifdef DIAGNOSTIC
 	/*

Index: src/sys/arch/alpha/include/cpu.h
diff -u src/sys/arch/alpha/include/cpu.h:1.88 src/sys/arch/alpha/include/cpu.h:1.89
--- src/sys/arch/alpha/include/cpu.h:1.88	Sat Aug 29 19:06:33 2020
+++ src/sys/arch/alpha/include/cpu.h	Sat Aug 29 20:07:00 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: cpu.h,v 1.88 2020/08/29 19:06:33 thorpej Exp $ */
+/* $NetBSD: cpu.h,v 1.89 2020/08/29 20:07:00 thorpej Exp $ */
 
 /*-
  * Copyright (c) 1998, 1999, 2000, 2001 The NetBSD Foundation, Inc.
@@ -128,11 +128,9 @@ struct cpu_info {
 	struct trapframe *ci_db_regs;	/* registers for debuggers */
 	uint64_t ci_pcc_freq;		/* cpu cycles/second */
 
-#define	CPU_INFO_PMAP_DATA_NQWORDS	16
-
 	struct pmap *ci_pmap;		/* currently-activated pmap */
-					/* data used by pmap module */
-	u_long ci_pmap_data[CPU_INFO_PMAP_DATA_NQWORDS];
+	u_int ci_next_asn;		/* next ASN to assign */
+	u_long ci_asn_gen;		/* current ASN generation */
 
 #if defined(MULTIPROCESSOR)
 	volatile u_long ci_flags;	/* flags; see below */

Index: src/sys/arch/alpha/include/intr.h
diff -u src/sys/arch/alpha/include/intr.h:1.72 src/sys/arch/alpha/include/intr.h:1.73
--- src/sys/arch/alpha/include/intr.h:1.72	Sat Jan 14 00:35:37 2017
+++ src/sys/arch/alpha/include/intr.h	Sat Aug 29 20:07:00 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: intr.h,v 1.72 2017/01/14 00:35:37 christos Exp $ */
+/* $NetBSD: intr.h,v 1.73 2020/08/29 20:07:00 thorpej Exp $ */
 
 /*-
  * Copyright (c) 2000, 2001, 2002 The NetBSD Foundation, Inc.
@@ -160,13 +160,12 @@ _splraise(int s)
 #define	ALPHA_IPI_HALT			(1UL << 0)
 #define	ALPHA_IPI_MICROSET		(1UL << 1)
 #define	ALPHA_IPI_SHOOTDOWN		(1UL << 2)
-#define	ALPHA_IPI_IMB			(1UL << 3)
-#define	ALPHA_IPI_AST			(1UL << 4)
-#define	ALPHA_IPI_PAUSE			(1UL << 5)
-#define	ALPHA_IPI_XCALL			(1UL << 6)
-#define	ALPHA_IPI_GENERIC		(1UL << 7)
+#define	ALPHA_IPI_AST			(1UL << 3)
+#define	ALPHA_IPI_PAUSE			(1UL << 4)
+#define	ALPHA_IPI_XCALL			(1UL << 5)
+#define	ALPHA_IPI_GENERIC		(1UL << 6)
 
-#define	ALPHA_NIPIS			8	/* must not exceed 64 */
+#define	ALPHA_NIPIS			7	/* must not exceed 64 */
 
 struct cpu_info;
 struct trapframe;

Index: src/sys/arch/alpha/include/pmap.h
diff -u src/sys/arch/alpha/include/pmap.h:1.82 src/sys/arch/alpha/include/pmap.h:1.83
--- src/sys/arch/alpha/include/pmap.h:1.82	Thu Jul 23 19:23:27 2020
+++ src/sys/arch/alpha/include/pmap.h	Sat Aug 29 20:07:00 2020
@@ -1,4 +1,4 @@
-/* $NetBSD: pmap.h,v 1.82 2020/07/23 19:23:27 skrll Exp $ */
+/* $NetBSD: pmap.h,v 1.83 2020/08/29 20:07:00 thorpej Exp $ */
 
 /*-
  * Copyright (c) 1998, 1999, 2000, 2001, 2007 The NetBSD Foundation, Inc.
@@ -123,35 +123,41 @@
  *
  * Note pm_asn and pm_asngen are arrays allocated in pmap_create().
  * Their size is based on the PCS count from the HWRPB, and indexed
- * by processor ID (from `whami').
+ * by processor ID (from `whami').  This is all padded to COHERENCY_UNIT
+ * to avoid false sharing.
  *
- * The kernel pmap is a special case; it gets statically-allocated
+ * The kernel pmap is a special case; since the kernel uses only ASM
+ * mappings and uses a reserved ASN to keep the TLB clean, we don't
+ * allocate any ASN info for the kernel pmap at all.
  * arrays which hold enough for ALPHA_MAXPROCS.
  */
 struct pmap_asn_info {
 	unsigned int		pma_asn;	/* address space number */
+	unsigned int		pma_pad0;
 	unsigned long		pma_asngen;	/* ASN generation number */
+	unsigned long		pma_padN[(COHERENCY_UNIT / 8) - 2];
 };
 
-struct pmap {
-	TAILQ_ENTRY(pmap)	pm_list;	/* list of all pmaps */
-	pt_entry_t		*pm_lev1map;	/* level 1 map */
-	int			pm_count;	/* pmap reference count */
-	kmutex_t		pm_lock;	/* lock on pmap */
-	struct pmap_statistics	pm_stats;	/* pmap statistics */
-	unsigned long		pm_cpus;	/* mask of CPUs using pmap */
-	unsigned long		pm_needisync;	/* mask of CPUs needing isync */
-	struct pmap_asn_info	pm_asni[];	/* ASN information */
+struct pmap {	/* pmaps are aligned to COHERENCY_UNIT boundaries */
+		/* pmaps are locked by hashed mutexes */
+	pt_entry_t		*pm_lev1map;	/* [ 0] level 1 map */
+	unsigned long		pm_cpus;	/* [ 8] CPUs using pmap */
+	unsigned long		pm_needisync;	/* [16] CPUs needing isync */
+	struct pmap_statistics	pm_stats;	/* [32] statistics */
+	long			pm_count;	/* [40] reference count */
+	TAILQ_ENTRY(pmap)	pm_list;	/* [48] list of all pmaps */
+	/* -- COHERENCY_UNIT boundary -- */
+	struct pmap_asn_info	pm_asni[];	/* [64] ASN information */
 			/*	variable length		*/
 };
 
-/*
- * Compute the sizeof of a pmap structure.
- */
 #define	PMAP_SIZEOF(x)							\
 	(ALIGN(offsetof(struct pmap, pm_asni[(x)])))
 
-#define	PMAP_ASN_RESERVED	0	/* reserved for Lev1map users */
+#define	PMAP_ASN_KERNEL		0	/* kernel-reserved ASN */
+#define	PMAP_ASN_FIRST_USER	1	/* first user ASN */
+#define	PMAP_ASNGEN_INVALID	0	/* reserved (invalid) ASN generation */
+#define	PMAP_ASNGEN_INITIAL	1	/* first valid generation */
 
 /*
  * For each struct vm_page, there is a list of all currently valid virtual
@@ -188,22 +194,12 @@ typedef struct pv_entry {
 #define	_PMAP_MAY_USE_PROM_CONSOLE
 #endif
 
-#if defined(MULTIPROCESSOR)
 struct cpu_info;
 struct trapframe;
 
-void	pmap_tlb_shootdown(pmap_t, vaddr_t, pt_entry_t, u_long *);
-void	pmap_tlb_shootnow(u_long);
-void	pmap_do_tlb_shootdown(struct cpu_info *, struct trapframe *);
-#define	PMAP_TLB_SHOOTDOWN_CPUSET_DECL		u_long shootset = 0;
-#define	PMAP_TLB_SHOOTDOWN(pm, va, pte)					\
-	pmap_tlb_shootdown((pm), (va), (pte), &shootset)
-#define	PMAP_TLB_SHOOTNOW()						\
-	pmap_tlb_shootnow(shootset)
-#else
-#define	PMAP_TLB_SHOOTDOWN_CPUSET_DECL		/* nothing */
-#define	PMAP_TLB_SHOOTDOWN(pm, va, pte)		/* nothing */
-#define	PMAP_TLB_SHOOTNOW()			/* nothing */
+void	pmap_init_cpu(struct cpu_info *);
+#if defined(MULTIPROCESSOR)
+void	pmap_tlb_shootdown_ipi(struct cpu_info *, struct trapframe *);
 #endif /* MULTIPROCESSOR */
 
 #define	pmap_resident_count(pmap)	((pmap)->pm_stats.resident_count)
@@ -331,17 +327,6 @@ pmap_l3pte(pmap_t pmap, vaddr_t v, pt_en
 }
 
 /*
- * Macros for locking pmap structures.
- *
- * Note that we if we access the kernel pmap in interrupt context, it
- * is only to update statistics.  Since stats are updated using atomic
- * operations, locking the kernel pmap is not necessary.  Therefore,
- * it is not necessary to block interrupts when locking pmap strucutres.
- */
-#define	PMAP_LOCK(pmap)		mutex_enter(&(pmap)->pm_lock)
-#define	PMAP_UNLOCK(pmap)	mutex_exit(&(pmap)->pm_lock)
-
-/*
  * Macro for processing deferred I-stream synchronization.
  *
  * The pmap module may defer syncing the user I-stream until the

Reply via email to