Hi,

I found some patch by Art from 2008 that removes the APTE stuff.  The 
patch was for i386 but I have adapted it to amd64.

Instead of mapping the ptes of an inactive pmap in the APTE range, and 
then doing an unconditional remote TLB-flush on all CPUs, we just switch 
to the other pmap locally. This only causes a local TLB-flush.  I have 
also reordered some stuff so that we will usually send the 
TLB-shootdown-IPIs first, then do some local stuff, and then wait for the 
remote TLB-shootdown to finish.

The patch can definitely be optimized further (e.g. use the direct mapping 
in some or all cases). But maybe people want to take a look already. I 
would be interested in results on AMD hardware.

On some real Intel hardware with 4 cores, I get (with an older version of 
the patch, but I don't think it makes much difference):

- no significant difference for kernel builds
- doing 'make index' in ports on mfs:
  w/o patch:  6m29.27s real     1m21.96s user     3m18.99s system
  with patch: 6m12.54s real     1m23.65s user     3m3.43s system
- forktest n=40000 (fork+exit micro benchmark)
  w/o patch:  0m8.75s real     0m0.36s user     0m7.51s system
  with patch: 0m7.43s real     0m0.18s user     0m7.79s system

I expect the speed-up to be larger with more CPUs.

On KVM the speed up is much larger than on bare metal, in the same range 
as with the paravirt patches (of course doing both is even better). I 
don't have any exact numbers right now, though. We have been running the 
patch on one of our build-VMs with 6 CPUs for over a week without any 
problems.

Cheers,
Stefan

============= forktest.c ====================== 
#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/wait.h>
#include <stdlib.h>


/*
 * fork+exit micro benchmark: create 40000 short-lived children,
 * keeping at most ~100 alive at any time.  Exit status is 0 on
 * success, 1 if fork() ever fails.
 */
int main(int ac, char **av)
{
        int n = 40000;          /* total children to fork */
        int active = 0;         /* forked but not yet reaped */

        (void)ac;               /* arguments are unused */
        (void)av;

        while (n-- > 0) {
                int status;

                /* Throttle: don't let more than 100 children pile up. */
                while (active > 100) {
                        /* waitpid(-1, ...) == wait4(WAIT_ANY, ..., NULL), but POSIX */
                        while (waitpid(-1, &status, WNOHANG) > 0)
                                active--;
                        if (active > 100)
                                usleep(5000);
                }

                pid_t pid = fork();
                if (pid < 0) {
                        perror("fork failed");
                        exit(1);        /* was exit(0): a failure must not report success */
                } else if (pid == 0) {
                        /* child: exit immediately, we only measure fork+exit */
                        exit(0);
                } else {
                        active++;
                }
        }

        /* Reap the remaining children so none are left behind as zombies. */
        while (active > 0) {
                int status;

                if (waitpid(-1, &status, 0) <= 0)
                        break;          /* ECHILD or error: nothing left to reap */
                active--;
        }

        return 0;
}
=============================================== 

And here is the diff:


diff --git a/sys/arch/amd64/amd64/pmap.c b/sys/arch/amd64/amd64/pmap.c
index 40b64e5..293dfc9 100644
--- a/sys/arch/amd64/amd64/pmap.c
+++ b/sys/arch/amd64/amd64/pmap.c
@@ -203,7 +203,6 @@ long nkptp[] = NKPTP_INITIALIZER;
 long nkptpmax[] = NKPTPMAX_INITIALIZER;
 long nbpd[] = NBPD_INITIALIZER;
 pd_entry_t *normal_pdes[] = PDES_INITIALIZER;
-pd_entry_t *alternate_pdes[] = APDES_INITIALIZER;
 
 /* int nkpde = NKPTP; */
 
@@ -289,7 +288,7 @@ void pmap_free_ptp(struct pmap *, struct vm_page *,
     vaddr_t, pt_entry_t *, pd_entry_t **, struct pg_to_free *);
 void pmap_freepage(struct pmap *, struct vm_page *, int, struct pg_to_free *);
 static boolean_t pmap_is_active(struct pmap *, int);
-void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***);
+void pmap_map_ptes(struct pmap *, pt_entry_t **, pd_entry_t ***, paddr_t *);
 struct pv_entry *pmap_remove_pv(struct vm_page *, struct pmap *, vaddr_t);
 void pmap_do_remove(struct pmap *, vaddr_t, vaddr_t, int);
 boolean_t pmap_remove_pte(struct pmap *, struct vm_page *, pt_entry_t *,
@@ -299,14 +298,23 @@ void pmap_remove_ptes(struct pmap *, struct vm_page *, 
vaddr_t,
 #define PMAP_REMOVE_ALL                0       /* remove all mappings */
 #define PMAP_REMOVE_SKIPWIRED  1       /* skip wired mappings */
 
-void pmap_unmap_ptes(struct pmap *);
+void pmap_unmap_ptes(struct pmap *, paddr_t);
 boolean_t pmap_get_physpage(vaddr_t, int, paddr_t *);
 boolean_t pmap_pdes_valid(vaddr_t, pd_entry_t **, pd_entry_t *);
 void pmap_alloc_level(pd_entry_t **, vaddr_t, int, long *);
-void pmap_apte_flush(struct pmap *pmap);
 
 void pmap_sync_flags_pte(struct vm_page *, u_long);
 
+void   pmap_tlb_shootpage(struct pmap *, vaddr_t, int);
+void   pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t, int);
+void   pmap_tlb_shoottlb(struct pmap *, int);
+#ifdef MULTIPROCESSOR
+void   pmap_tlb_shootwait(void);
+#else
+#define        pmap_tlb_shootwait()
+#endif
+
+
 /*
  * p m a p   i n l i n e   h e l p e r   f u n c t i o n s
  */
@@ -349,55 +357,44 @@ pmap_sync_flags_pte(struct vm_page *pg, u_long pte)
        }
 }
 
-void
-pmap_apte_flush(struct pmap *pmap)
-{
-       pmap_tlb_shoottlb();
-       pmap_tlb_shootwait();
-}
-
 /*
  * pmap_map_ptes: map a pmap's PTEs into KVM
- *
- * => we lock enough pmaps to keep things locked in
- * => must be undone with pmap_unmap_ptes before returning
  */
 
 void
-pmap_map_ptes(struct pmap *pmap, pt_entry_t **ptepp, pd_entry_t ***pdeppp)
+pmap_map_ptes(struct pmap *pmap, pt_entry_t **ptepp, pd_entry_t ***pdeppp, 
paddr_t *save_cr3)
 {
-       pd_entry_t opde, npde;
+       paddr_t cr3 = rcr3();
 
-       /* if curpmap then we are always mapped */
-       if (pmap_is_curpmap(pmap)) {
-               *ptepp = PTE_BASE;
-               *pdeppp = normal_pdes;
-               return;
-       }
+       /* the kernel's pmap is always accessible */
+       if (pmap == pmap_kernel() || pmap->pm_pdirpa == cr3) {
+               *save_cr3 = 0;
+       } else {
+               *save_cr3 = cr3;
+
+               /*
+                * Not sure if we need this, but better be safe.
+                * We don't have the current pmap in order to unset its
+                * active bit, but this just means that we may receive
+                * an unneccessary cross-CPU TLB flush now and then.
+                */
+               x86_atomic_setbits_u64(&pmap->pm_cpus, (1ULL << cpu_number()));
 
-       /* need to load a new alternate pt space into curpmap? */
-       opde = *APDP_PDE;
-       if (!pmap_valid_entry(opde) || (opde & PG_FRAME) != pmap->pm_pdirpa) {
-               npde = (pd_entry_t) (pmap->pm_pdirpa | PG_RW | PG_V);
-               *APDP_PDE = npde;
-               if (pmap_valid_entry(opde))
-                       pmap_apte_flush(curpcb->pcb_pmap);
+               lcr3(pmap->pm_pdirpa);
        }
-       *ptepp = APTE_BASE;
-       *pdeppp = alternate_pdes;
+
+       *ptepp = PTE_BASE;
+       *pdeppp = normal_pdes;
+       return;
 }
 
 void
-pmap_unmap_ptes(struct pmap *pmap)
+pmap_unmap_ptes(struct pmap *pmap, paddr_t save_cr3)
 {
-       if (pmap_is_curpmap(pmap))
-               return;
-
-#if defined(MULTIPROCESSOR)
-       *APDP_PDE = 0;
-       pmap_apte_flush(curpcb->pcb_pmap);
-#endif
-       COUNT(apdp_pde_unmap);
+       if (save_cr3 != 0) {
+               x86_atomic_clearbits_u64(&pmap->pm_cpus, (1ULL << 
cpu_number()));
+               lcr3(save_cr3);
+       }
 }
 
 /*
@@ -442,7 +439,7 @@ pmap_kenter_pa(vaddr_t va, paddr_t pa, vm_prot_t prot)
                if (pa & PMAP_NOCACHE && (opte & PG_N) == 0)
                        wbinvd();
                /* This shouldn't happen */
-               pmap_tlb_shootpage(pmap_kernel(), va);
+               pmap_tlb_shootpage(pmap_kernel(), va, 1);
                pmap_tlb_shootwait();
        }
 }
@@ -476,7 +473,7 @@ pmap_kremove(vaddr_t sva, vsize_t len)
                KASSERT((opte & PG_PVLIST) == 0);
        }
 
-       pmap_tlb_shootrange(pmap_kernel(), sva, eva);
+       pmap_tlb_shootrange(pmap_kernel(), sva, eva, 1);
        pmap_tlb_shootwait();
 }
 
@@ -822,11 +819,13 @@ pmap_free_ptp(struct pmap *pmap, struct vm_page *ptp, 
vaddr_t va,
                invaladdr = level == 1 ? (vaddr_t)ptes :
                    (vaddr_t)pdes[level - 2];
                pmap_tlb_shootpage(curpcb->pcb_pmap,
-                   invaladdr + index * PAGE_SIZE);
+                   invaladdr + index * PAGE_SIZE,
+                   pmap_is_curpmap(curpcb->pcb_pmap));
 #if defined(MULTIPROCESSOR)
                invaladdr = level == 1 ? (vaddr_t)PTE_BASE :
                    (vaddr_t)normal_pdes[level - 2];
-               pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE);
+               pmap_tlb_shootpage(pmap, invaladdr + index * PAGE_SIZE,
+                   pmap_is_curpmap(curpcb->pcb_pmap));
 #endif
                if (level < PTP_LEVELS - 1) {
                        ptp = pmap_find_ptp(pmap, va, (paddr_t)-1, level + 1);
@@ -1054,11 +1053,7 @@ pmap_destroy(struct pmap *pmap)
                }
        }
 
-       /*
-        * MULTIPROCESSOR -- no need to flush out of other processors'
-        * APTE space because we do that in pmap_unmap_ptes().
-        */
-       /* XXX: need to flush it out of other processor's APTE space? */
+       /* XXX: need to flush it out of other processor's space? */
        pool_put(&pmap_pdp_pool, pmap->pm_pdir);
 
        pool_put(&pmap_pmap_pool, pmap);
@@ -1132,7 +1127,7 @@ pmap_pdes_valid(vaddr_t va, pd_entry_t **pdes, pd_entry_t 
*lastpde)
        for (i = PTP_LEVELS; i > 1; i--) {
                index = pl_i(va, i);
                pde = pdes[i - 2][index];
-               if ((pde & PG_V) == 0)
+               if (!pmap_valid_entry(pde))
                        return FALSE;
        }
        if (lastpde != NULL)
@@ -1149,6 +1144,7 @@ pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
 {
        pt_entry_t *ptes, pte;
        pd_entry_t pde, **pdes;
+       paddr_t scr3;
 
        if (pmap == pmap_kernel() && va >= PMAP_DIRECT_BASE &&
            va < PMAP_DIRECT_END) {
@@ -1156,7 +1152,7 @@ pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t *pap)
                return (TRUE);
        }
 
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
        if (pmap_pdes_valid(va, pdes, &pde) == FALSE) {
                return FALSE;
        }
@@ -1164,14 +1160,14 @@ pmap_extract(struct pmap *pmap, vaddr_t va, paddr_t 
*pap)
        if (pde & PG_PS) {
                if (pap != NULL)
                        *pap = (pde & PG_LGFRAME) | (va & 0x1fffff);
-               pmap_unmap_ptes(pmap);
+               pmap_unmap_ptes(pmap, scr3);
                return (TRUE);
        }
 
        pte = ptes[pl1_i(va)];
-       pmap_unmap_ptes(pmap);
+       pmap_unmap_ptes(pmap, scr3);
 
-       if (__predict_true((pte & PG_V) != 0)) {
+       if (__predict_true(pmap_valid_entry(pte))) {
                if (pap != NULL)
                        *pap = (pte & PG_FRAME) | (va & 0xfff);
                return (TRUE);
@@ -1439,11 +1435,12 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t 
eva, int flags)
        vaddr_t va;
        int shootall = 0;
        struct pg_to_free empty_ptps;
+       paddr_t scr3;
 
        TAILQ_INIT(&empty_ptps);
 
        PMAP_MAP_TO_HEAD_LOCK();
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 
        /*
         * removing one page?  take shortcut function.
@@ -1481,11 +1478,11 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t 
eva, int flags)
                        if (result && ptp && ptp->wire_count <= 1)
                                pmap_free_ptp(pmap, ptp, sva, ptes, pdes,
                                    &empty_ptps);
-                       pmap_tlb_shootpage(pmap, sva);
+                       pmap_tlb_shootpage(pmap, sva, scr3 == 0);
                }
 
+               pmap_unmap_ptes(pmap, scr3);
                pmap_tlb_shootwait();
-               pmap_unmap_ptes(pmap);
                PMAP_MAP_TO_HEAD_UNLOCK();
 
                while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
@@ -1551,13 +1548,12 @@ pmap_do_remove(struct pmap *pmap, vaddr_t sva, vaddr_t 
eva, int flags)
        }
 
        if (shootall)
-               pmap_tlb_shoottlb();
+               pmap_tlb_shoottlb(pmap, scr3 == 0);
        else
-               pmap_tlb_shootrange(pmap, sva, eva);
+               pmap_tlb_shootrange(pmap, sva, eva, scr3 == 0);
 
+       pmap_unmap_ptes(pmap, scr3);
        pmap_tlb_shootwait();
-
-       pmap_unmap_ptes(pmap);
        PMAP_MAP_TO_HEAD_UNLOCK();
 
        while ((ptp = TAILQ_FIRST(&empty_ptps)) != NULL) {
@@ -1583,6 +1579,7 @@ pmap_page_remove(struct vm_page *pg)
 #endif
        struct pg_to_free empty_ptps;
        struct vm_page *ptp;
+       paddr_t scr3;
 
        TAILQ_INIT(&empty_ptps);
 
@@ -1591,7 +1588,7 @@ pmap_page_remove(struct vm_page *pg)
        while ((pve = pg->mdpage.pv_list) != NULL) {
                pg->mdpage.pv_list = pve->pv_next;
 
-               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
+               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes, &scr3);
 
 #ifdef DIAGNOSTIC
                if (pve->pv_ptp && pmap_pdes_valid(pve->pv_va, pdes, &pde) &&
@@ -1614,7 +1611,7 @@ pmap_page_remove(struct vm_page *pg)
                        pve->pv_pmap->pm_stats.wired_count--;
                pve->pv_pmap->pm_stats.resident_count--;
 
-               pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
+               pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, scr3 == 0);
 
                pmap_sync_flags_pte(pg, opte);
 
@@ -1626,7 +1623,7 @@ pmap_page_remove(struct vm_page *pg)
                                    pve->pv_va, ptes, pdes, &empty_ptps);
                        }
                }
-               pmap_unmap_ptes(pve->pv_pmap);
+               pmap_unmap_ptes(pve->pv_pmap, scr3);
                pool_put(&pmap_pv_pool, pve);
        }
 
@@ -1657,6 +1654,7 @@ pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
        pt_entry_t *ptes, pte;
        pd_entry_t **pdes;
        u_long mybits, testflags;
+       paddr_t scr3;
 
        testflags = pmap_pte2flags(testbits);
 
@@ -1667,9 +1665,9 @@ pmap_test_attrs(struct vm_page *pg, unsigned int testbits)
        mybits = 0;
        for (pve = pg->mdpage.pv_list; pve != NULL && mybits == 0;
            pve = pve->pv_next) {
-               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
+               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes, &scr3);
                pte = ptes[pl1_i(pve->pv_va)];
-               pmap_unmap_ptes(pve->pv_pmap);
+               pmap_unmap_ptes(pve->pv_pmap, scr3);
                mybits |= (pte & testbits);
        }
        PMAP_HEAD_TO_MAP_UNLOCK();
@@ -1696,6 +1694,7 @@ pmap_clear_attrs(struct vm_page *pg, unsigned long 
clearbits)
        pd_entry_t **pdes;
        u_long clearflags;
        int result;
+       paddr_t scr3;
 
        clearflags = pmap_pte2flags(clearbits);
 
@@ -1706,7 +1705,7 @@ pmap_clear_attrs(struct vm_page *pg, unsigned long 
clearbits)
                atomic_clearbits_int(&pg->pg_flags, clearflags);
 
        for (pve = pg->mdpage.pv_list; pve != NULL; pve = pve->pv_next) {
-               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes);
+               pmap_map_ptes(pve->pv_pmap, &ptes, &pdes, &scr3);
 #ifdef DIAGNOSTIC
                if (!pmap_pdes_valid(pve->pv_va, pdes, NULL))
                        panic("pmap_change_attrs: mapping without PTP "
@@ -1718,9 +1717,9 @@ pmap_clear_attrs(struct vm_page *pg, unsigned long 
clearbits)
                        result = 1;
                        pmap_pte_clearbits(&ptes[pl1_i(pve->pv_va)],
                            (opte & clearbits));
-                       pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va);
+                       pmap_tlb_shootpage(pve->pv_pmap, pve->pv_va, scr3 == 0);
                }
-               pmap_unmap_ptes(pve->pv_pmap);
+               pmap_unmap_ptes(pve->pv_pmap, scr3);
        }
 
        PMAP_HEAD_TO_MAP_UNLOCK();
@@ -1763,8 +1762,9 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, 
vaddr_t eva, vm_prot_t prot)
        vaddr_t blockend;
        int shootall = 0;
        vaddr_t va;
+       paddr_t scr3;
 
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 
        /* should be ok, but just in case ... */
        sva &= PG_FRAME;
@@ -1808,7 +1808,7 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, 
vaddr_t eva, vm_prot_t prot)
                epte = &ptes[pl1_i(blockend)];
 
                for (/*null */; spte < epte ; spte++) {
-                       if (!(*spte & PG_V))
+                       if (!pmap_valid_entry(*spte))
                                continue;
                        pmap_pte_clearbits(spte, PG_RW);
                        pmap_pte_setbits(spte, nx);
@@ -1816,13 +1816,13 @@ pmap_write_protect(struct pmap *pmap, vaddr_t sva, 
vaddr_t eva, vm_prot_t prot)
        }
 
        if (shootall)
-               pmap_tlb_shoottlb();
+               pmap_tlb_shoottlb(pmap, scr3 == 0);
        else
-               pmap_tlb_shootrange(pmap, sva, eva);
+               pmap_tlb_shootrange(pmap, sva, eva, scr3 == 0);
 
-       pmap_tlb_shootwait();
+       pmap_unmap_ptes(pmap, scr3);
 
-       pmap_unmap_ptes(pmap);
+       pmap_tlb_shootwait();
 }
 
 /*
@@ -1840,8 +1840,9 @@ pmap_unwire(struct pmap *pmap, vaddr_t va)
 {
        pt_entry_t *ptes;
        pd_entry_t **pdes;
+       paddr_t scr3;
 
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 
        if (pmap_pdes_valid(va, pdes, NULL)) {
 
@@ -1859,7 +1860,7 @@ pmap_unwire(struct pmap *pmap, vaddr_t va)
                               "didn't change!\n", pmap, va);
                }
 #endif
-               pmap_unmap_ptes(pmap);
+               pmap_unmap_ptes(pmap, scr3);
        }
 #ifdef DIAGNOSTIC
        else {
@@ -1917,12 +1918,13 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, 
vm_prot_t prot, int flags)
        boolean_t nocache = (pa & PMAP_NOCACHE) != 0;
        boolean_t wc = (pa & PMAP_WC) != 0;
        int error;
+       paddr_t scr3;
 
        KASSERT(!(wc && nocache));
        pa &= PMAP_PA_MASK;
 
 #ifdef DIAGNOSTIC
-       if (va == (vaddr_t) PDP_BASE || va == (vaddr_t) APDP_BASE)
+       if (va == (vaddr_t) PDP_BASE)
                panic("pmap_enter: trying to map over PDP/APDP!");
 
        /* sanity check: kernel PTPs should already have been pre-allocated */
@@ -1939,7 +1941,7 @@ pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, 
vm_prot_t prot, int flags)
         * map in ptes and get a pointer to our PTP (unless we are the kernel)
         */
 
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
        if (pmap == pmap_kernel()) {
                ptp = NULL;
        } else {
@@ -2109,17 +2111,17 @@ enter_now:
         * If we changed anything other than modified/used bits,
         * flush the TLB.  (is this overkill?)
         */
-       if (opte & PG_V) {
+       if (pmap_valid_entry(opte)) {
                if (nocache && (opte & PG_N) == 0)
                        wbinvd();
-               pmap_tlb_shootpage(pmap, va);
+               pmap_tlb_shootpage(pmap, va, scr3 == 0);
                pmap_tlb_shootwait();
        }
 
        error = 0;
 
 out:
-       pmap_unmap_ptes(pmap);
+       pmap_unmap_ptes(pmap, scr3);
        PMAP_MAP_TO_HEAD_UNLOCK();
 
        return error;
@@ -2340,6 +2342,7 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
        pt_entry_t *ptes, *pte;
        pd_entry_t **pdes;
        vaddr_t blkendva;
+       paddr_t scr3;
 
        /*
         * if end is out of range truncate.
@@ -2351,7 +2354,7 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
 
 
        PMAP_MAP_TO_HEAD_LOCK();
-       pmap_map_ptes(pmap, &ptes, &pdes);
+       pmap_map_ptes(pmap, &ptes, &pdes, &scr3);
 
        /*
         * dumping a range of pages: we dump in PTP sized blocks (4MB)
@@ -2376,7 +2379,7 @@ pmap_dump(struct pmap *pmap, vaddr_t sva, vaddr_t eva)
                               sva, *pte, *pte & PG_FRAME);
                }
        }
-       pmap_unmap_ptes(pmap);
+       pmap_unmap_ptes(pmap, scr3);
        PMAP_MAP_TO_HEAD_UNLOCK();
 }
 #endif
@@ -2417,7 +2420,7 @@ volatile vaddr_t tlb_shoot_addr1;
 volatile vaddr_t tlb_shoot_addr2;
 
 void
-pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
+pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
 {
        struct cpu_info *ci, *self = curcpu();
        CPU_INFO_ITERATOR cii;
@@ -2449,12 +2452,12 @@ pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
                splx(s);
        }
 
-       if (pmap_is_curpmap(pm))
+       if (shootself)
                pmap_update_pg(va);
 }
 
 void
-pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
+pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
 {
        struct cpu_info *ci, *self = curcpu();
        CPU_INFO_ITERATOR cii;
@@ -2488,13 +2491,13 @@ pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, 
vaddr_t eva)
                splx(s);
        }
 
-       if (pmap_is_curpmap(pm))
+       if (shootself)
                for (va = sva; va < eva; va += PAGE_SIZE)
                        pmap_update_pg(va);
 }
 
 void
-pmap_tlb_shoottlb(void)
+pmap_tlb_shoottlb(struct pmap *pm, int shootself)
 {
        struct cpu_info *ci, *self = curcpu();
        CPU_INFO_ITERATOR cii;
@@ -2502,7 +2505,7 @@ pmap_tlb_shoottlb(void)
        u_int64_t mask = 0;
 
        CPU_INFO_FOREACH(cii, ci) {
-               if (ci == self || !(ci->ci_flags & CPUF_RUNNING))
+               if (ci == self || !pmap_is_active(pm, ci->ci_cpuid) || 
!(ci->ci_flags & CPUF_RUNNING))
                        continue;
                mask |= (1ULL << ci->ci_cpuid);
                wait++;
@@ -2525,7 +2528,8 @@ pmap_tlb_shoottlb(void)
                splx(s);
        }
 
-       tlbflush();
+       if (shootself)
+               tlbflush();
 }
 
 void
@@ -2538,26 +2542,30 @@ pmap_tlb_shootwait(void)
 #else
 
 void
-pmap_tlb_shootpage(struct pmap *pm, vaddr_t va)
+pmap_tlb_shootpage(struct pmap *pm, vaddr_t va, int shootself)
 {
-       if (pmap_is_curpmap(pm))
+       if (shootself)
                pmap_update_pg(va);
 
 }
 
 void
-pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva)
+pmap_tlb_shootrange(struct pmap *pm, vaddr_t sva, vaddr_t eva, int shootself)
 {
        vaddr_t va;
 
+       if (!shootself)
+               return;
+
        for (va = sva; va < eva; va += PAGE_SIZE)
                pmap_update_pg(va);     
 
 }
 
 void
-pmap_tlb_shoottlb(void)
+pmap_tlb_shoottlb(struct pmap *pm, int shootself)
 {
-       tlbflush();
+       if (shootself)
+               tlbflush();
 }
 #endif /* MULTIPROCESSOR */
diff --git a/sys/arch/amd64/include/pmap.h b/sys/arch/amd64/include/pmap.h
index b3ba005..d802888 100644
--- a/sys/arch/amd64/include/pmap.h
+++ b/sys/arch/amd64/include/pmap.h
@@ -155,44 +155,34 @@
 #define L4_SLOT_PTE            255
 #define L4_SLOT_KERN           256
 #define L4_SLOT_KERNBASE       511
-#define L4_SLOT_APTE           510
 #define L4_SLOT_DIRECT         509
 
 #define PDIR_SLOT_KERN         L4_SLOT_KERN
 #define PDIR_SLOT_PTE          L4_SLOT_PTE
-#define PDIR_SLOT_APTE         L4_SLOT_APTE
 #define PDIR_SLOT_DIRECT       L4_SLOT_DIRECT
 
 /*
  * the following defines give the virtual addresses of various MMU
  * data structures:
- * PTE_BASE and APTE_BASE: the base VA of the linear PTE mappings
- * PTD_BASE and APTD_BASE: the base VA of the recursive mapping of the PTD
- * PDP_PDE and APDP_PDE: the VA of the PDE that points back to the PDP/APDP
+ * PTE_BASE: the base VA of the linear PTE mappings
+ * PTD_BASE: the base VA of the recursive mapping of the PTD
+ * PDP_PDE: the VA of the PDE that points back to the PDP
  *
  */
 
 #define PTE_BASE  ((pt_entry_t *) (L4_SLOT_PTE * NBPD_L4))
-#define APTE_BASE ((pt_entry_t *) (VA_SIGN_NEG((L4_SLOT_APTE * NBPD_L4))))
 #define PMAP_DIRECT_BASE       (VA_SIGN_NEG((L4_SLOT_DIRECT * NBPD_L4)))
 #define PMAP_DIRECT_END                (VA_SIGN_NEG(((L4_SLOT_DIRECT + 1) * 
NBPD_L4)))
 
 #define L1_BASE                PTE_BASE
-#define AL1_BASE       APTE_BASE
 
 #define L2_BASE ((pd_entry_t *)((char *)L1_BASE + L4_SLOT_PTE * NBPD_L3))
 #define L3_BASE ((pd_entry_t *)((char *)L2_BASE + L4_SLOT_PTE * NBPD_L2))
 #define L4_BASE ((pd_entry_t *)((char *)L3_BASE + L4_SLOT_PTE * NBPD_L1))
 
-#define AL2_BASE ((pd_entry_t *)((char *)AL1_BASE + L4_SLOT_PTE * NBPD_L3))
-#define AL3_BASE ((pd_entry_t *)((char *)AL2_BASE + L4_SLOT_PTE * NBPD_L2))
-#define AL4_BASE ((pd_entry_t *)((char *)AL3_BASE + L4_SLOT_PTE * NBPD_L1))
-
 #define PDP_PDE                (L4_BASE + PDIR_SLOT_PTE)
-#define APDP_PDE       (L4_BASE + PDIR_SLOT_APTE)
 
 #define PDP_BASE       L4_BASE
-#define APDP_BASE      AL4_BASE
 
 #define NKL4_MAX_ENTRIES       (unsigned long)1
 #define NKL3_MAX_ENTRIES       (unsigned long)(NKL4_MAX_ENTRIES * 512)
@@ -249,7 +239,6 @@
                                  NKL3_MAX_ENTRIES, NKL4_MAX_ENTRIES }
 #define NBPD_INITIALIZER       { NBPD_L1, NBPD_L2, NBPD_L3, NBPD_L4 }
 #define PDES_INITIALIZER       { L2_BASE, L3_BASE, L4_BASE }
-#define APDES_INITIALIZER      { AL2_BASE, AL3_BASE, AL4_BASE }
 
 /*
  * PTP macros:
@@ -412,15 +401,6 @@ void               pmap_write_protect(struct pmap *, 
vaddr_t,
 
 vaddr_t reserve_dumppages(vaddr_t); /* XXX: not a pmap fn */
 
-void   pmap_tlb_shootpage(struct pmap *, vaddr_t);
-void   pmap_tlb_shootrange(struct pmap *, vaddr_t, vaddr_t);
-void   pmap_tlb_shoottlb(void);
-#ifdef MULTIPROCESSOR
-void   pmap_tlb_shootwait(void);
-#else
-#define        pmap_tlb_shootwait()
-#endif
-
 paddr_t        pmap_prealloc_lowmem_ptps(paddr_t);
 
 void   pagezero(vaddr_t);

Reply via email to