Module Name: src Committed By: mrg Date: Sun Mar 28 05:24:01 UTC 2010
Modified Files: src/sys/arch/sparc64/include: cpu.h src/sys/arch/sparc64/sparc64: cache.h ipifuncs.c pmap.c Log Message: - add a kmutex_t ci_ctx_lock to struct cpu_info, and initialise it in cpu_pmap_init() and replace pmap_ctx_lock usage with this new ci_ctx_lock - replace smp_dcache_flush_page_all() with smp_dcache_flush_page_cpuset(), which flushes only on the specified set of CPUs, not everyone. add new dcache_flush_page_cpuset() to flush this page from the D$ only on the specified set of CPUs. - add a cpuset to pmap_free_page() and use it when freeing PTE pages when a pmap is destroyed - introduce pmap_free_page_noflush(), and use it when we allocated a page for PTEs but didn't use it and don't need to flush it - don't bother with pmap_lock in pmap_extract(), the only potential issue is pseg_get() which is already safe tested on sb2000, sb2500 and ultra80 with a bunch of various heavy workloads, and seems to give a clear 1-2% speed up for high-forking / short lived processes, such as ./configure. To generate a diff of this commit: cvs rdiff -u -r1.89 -r1.90 src/sys/arch/sparc64/include/cpu.h cvs rdiff -u -r1.17 -r1.18 src/sys/arch/sparc64/sparc64/cache.h cvs rdiff -u -r1.35 -r1.36 src/sys/arch/sparc64/sparc64/ipifuncs.c cvs rdiff -u -r1.261 -r1.262 src/sys/arch/sparc64/sparc64/pmap.c Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/sys/arch/sparc64/include/cpu.h diff -u src/sys/arch/sparc64/include/cpu.h:1.89 src/sys/arch/sparc64/include/cpu.h:1.90 --- src/sys/arch/sparc64/include/cpu.h:1.89 Sat Mar 6 08:08:29 2010 +++ src/sys/arch/sparc64/include/cpu.h Sun Mar 28 05:24:00 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: cpu.h,v 1.89 2010/03/06 08:08:29 mrg Exp $ */ +/* $NetBSD: cpu.h,v 1.90 2010/03/28 05:24:00 mrg Exp $ */ /* * Copyright (c) 1992, 1993 @@ -151,7 +151,10 @@ * the right pointer and you get to the pmap segment tables. These are * physical addresses, of course. * + * ci_ctx_lock protects this CPUs context allocation/free. + * These are all allocated almost with in the same cacheline. */ + kmutex_t ci_ctx_lock; int ci_pmap_next_ctx; int ci_numctx; paddr_t *ci_ctxbusy; Index: src/sys/arch/sparc64/sparc64/cache.h diff -u src/sys/arch/sparc64/sparc64/cache.h:1.17 src/sys/arch/sparc64/sparc64/cache.h:1.18 --- src/sys/arch/sparc64/sparc64/cache.h:1.17 Mon Mar 8 08:59:06 2010 +++ src/sys/arch/sparc64/sparc64/cache.h Sun Mar 28 05:24:00 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: cache.h,v 1.17 2010/03/08 08:59:06 mrg Exp $ */ +/* $NetBSD: cache.h,v 1.18 2010/03/28 05:24:00 mrg Exp $ */ /* * Copyright (c) 1996 @@ -144,14 +144,16 @@ #ifdef MULTIPROCESSOR void smp_tlb_flush_pte(vaddr_t, struct pmap *); -void smp_dcache_flush_page_all(paddr_t pa); +void smp_dcache_flush_page_cpuset(paddr_t pa, sparc64_cpuset_t); void smp_blast_dcache(sparc64_cpuset_t); #define tlb_flush_pte(va,pm ) smp_tlb_flush_pte(va, pm) -#define dcache_flush_page_all(pa) smp_dcache_flush_page_all(pa) +#define dcache_flush_page_all(pa) smp_dcache_flush_page_cpuset(pa, cpus_active) +#define dcache_flush_page_cpuset(pa,cs) smp_dcache_flush_page_cpuset(pa, cs) #define blast_dcache() smp_blast_dcache(cpus_active) #else #define tlb_flush_pte(va,pm) sp_tlb_flush_pte(va, (pm)->pm_ctx[0]) #define dcache_flush_page_all(pa) dcache_flush_page(pa) +#define dcache_flush_page_cpuset(pa,cs) dcache_flush_page(pa) #define 
blast_dcache() sp_blast_dcache(dcache_size, \ dcache_line_size) #endif Index: src/sys/arch/sparc64/sparc64/ipifuncs.c diff -u src/sys/arch/sparc64/sparc64/ipifuncs.c:1.35 src/sys/arch/sparc64/sparc64/ipifuncs.c:1.36 --- src/sys/arch/sparc64/sparc64/ipifuncs.c:1.35 Mon Mar 8 08:59:06 2010 +++ src/sys/arch/sparc64/sparc64/ipifuncs.c Sun Mar 28 05:24:00 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: ipifuncs.c,v 1.35 2010/03/08 08:59:06 mrg Exp $ */ +/* $NetBSD: ipifuncs.c,v 1.36 2010/03/28 05:24:00 mrg Exp $ */ /*- * Copyright (c) 2004 The NetBSD Foundation, Inc. @@ -27,7 +27,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: ipifuncs.c,v 1.35 2010/03/08 08:59:06 mrg Exp $"); +__KERNEL_RCSID(0, "$NetBSD: ipifuncs.c,v 1.36 2010/03/28 05:24:00 mrg Exp $"); #include "opt_ddb.h" @@ -412,10 +412,10 @@ } /* - * Make sure this page is flushed from all CPUs. + * Make sure this page is flushed from all/some CPUs. */ void -smp_dcache_flush_page_all(paddr_t pa) +smp_dcache_flush_page_cpuset(paddr_t pa, sparc64_cpuset_t activecpus) { ipifunc_t func; @@ -424,7 +424,7 @@ else func = sparc64_ipi_dcache_flush_page_us; - sparc64_broadcast_ipi(func, pa, dcache_line_size); + sparc64_multicast_ipi(activecpus, func, pa, dcache_line_size); dcache_flush_page(pa); } Index: src/sys/arch/sparc64/sparc64/pmap.c diff -u src/sys/arch/sparc64/sparc64/pmap.c:1.261 src/sys/arch/sparc64/sparc64/pmap.c:1.262 --- src/sys/arch/sparc64/sparc64/pmap.c:1.261 Sun Mar 21 22:38:08 2010 +++ src/sys/arch/sparc64/sparc64/pmap.c Sun Mar 28 05:24:00 2010 @@ -1,4 +1,4 @@ -/* $NetBSD: pmap.c,v 1.261 2010/03/21 22:38:08 mrg Exp $ */ +/* $NetBSD: pmap.c,v 1.262 2010/03/28 05:24:00 mrg Exp $ */ /* * * Copyright (C) 1996-1999 Eduardo Horvath. 
@@ -26,7 +26,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.261 2010/03/21 22:38:08 mrg Exp $"); +__KERNEL_RCSID(0, "$NetBSD: pmap.c,v 1.262 2010/03/28 05:24:00 mrg Exp $"); #undef NO_VCACHE /* Don't forget the locked TLB in dostart */ #define HWREF @@ -323,14 +323,14 @@ #define pv_check() -static int pmap_get_page(paddr_t *p); -static void pmap_free_page(paddr_t pa); +static int pmap_get_page(paddr_t *); +static void pmap_free_page(paddr_t, sparc64_cpuset_t); +static void pmap_free_page_noflush(paddr_t); /* - * Global pmap lock. + * Global pmap locks. */ static kmutex_t pmap_lock; -static kmutex_t pmap_ctx_lock; static bool lock_available = false; /* @@ -1226,6 +1226,7 @@ { size_t ctxsize; + mutex_init(&ci->ci_ctx_lock, MUTEX_SPIN, IPL_VM); ci->ci_pmap_next_ctx = 1; #ifdef SUN4V #error find out if we have 16 or 13 bit context ids @@ -1295,7 +1296,6 @@ vm_num_phys = avail_end - avail_start; mutex_init(&pmap_lock, MUTEX_DEFAULT, IPL_NONE); - mutex_init(&pmap_ctx_lock, MUTEX_SPIN, IPL_VM); #if defined(USE_LOCKSAFE_PSEG_GETSET) mutex_init(&pseg_lock, MUTEX_SPIN, IPL_VM); #endif @@ -1410,6 +1410,9 @@ { #ifdef MULTIPROCESSOR struct cpu_info *ci; + sparc64_cpuset_t pmap_cpus_active; +#else +#define pmap_cpus_active 0 #endif struct vm_page *pg, *nextpg; @@ -1417,26 +1420,36 @@ return; } DPRINTF(PDB_DESTROY, ("pmap_destroy: freeing pmap %p\n", pm)); - mutex_enter(&pmap_ctx_lock); #ifdef MULTIPROCESSOR + CPUSET_CLEAR(pmap_cpus_active); for (ci = cpus; ci != NULL; ci = ci->ci_next) { - if (CPUSET_HAS(cpus_active, ci->ci_index)) - ctx_free(pm, ci); + /* XXXMRG: Move the lock inside one or both tests? 
*/ + mutex_enter(&ci->ci_ctx_lock); + if (CPUSET_HAS(cpus_active, ci->ci_index)) { + if (pm->pm_ctx[ci->ci_index] > 0) { + CPUSET_ADD(pmap_cpus_active, ci->ci_index); + ctx_free(pm, ci); + } + } + mutex_exit(&ci->ci_ctx_lock); } #else - ctx_free(pm, curcpu()); + if (pmap_ctx(pm)) { + mutex_enter(&curcpu()->ci_ctx_lock); + ctx_free(pm, curcpu()); + mutex_exit(&curcpu()->ci_ctx_lock); + } #endif - mutex_exit(&pmap_ctx_lock); /* we could be a little smarter and leave pages zeroed */ for (pg = TAILQ_FIRST(&pm->pm_obj.memq); pg != NULL; pg = nextpg) { nextpg = TAILQ_NEXT(pg, listq.queue); TAILQ_REMOVE(&pm->pm_obj.memq, pg, listq.queue); KASSERT(pg->mdpage.mdpg_pvh.pv_pmap == NULL); - dcache_flush_page_all(VM_PAGE_TO_PHYS(pg)); + dcache_flush_page_cpuset(VM_PAGE_TO_PHYS(pg), pmap_cpus_active); uvm_pagefree(pg); } - pmap_free_page((paddr_t)(u_long)pm->pm_segs); + pmap_free_page((paddr_t)(u_long)pm->pm_segs, pmap_cpus_active); UVM_OBJ_DESTROY(&pm->pm_obj); pool_cache_put(&pmap_cache, pm); } @@ -1555,7 +1568,7 @@ /* We allocated a spare page but didn't use it. Free it. */ printf("pmap_kenter_pa: freeing unused page %llx\n", (long long)ptp); - pmap_free_page(ptp); + pmap_free_page_noflush(ptp); } #ifdef DEBUG i = ptelookup_va(va); @@ -1826,7 +1839,7 @@ /* We allocated a spare page but didn't use it. Free it. */ printf("pmap_enter: freeing unused page %llx\n", (long long)ptp); - pmap_free_page(ptp); + pmap_free_page_noflush(ptp); } if (dopv) { pmap_enter_pv(pm, va, pa, pg, npv); @@ -1924,22 +1937,36 @@ write_user_windows(); pm->pm_refs = 0; - mutex_enter(&pmap_ctx_lock); + /* + * XXXMRG: pmap_destroy() does exactly the same dance here. + * surely one of them isn't necessary? + */ #ifdef MULTIPROCESSOR CPUSET_CLEAR(pmap_cpus_active); for (ci = cpus; ci != NULL; ci = ci->ci_next) { + /* XXXMRG: Move the lock inside one or both tests? 
*/ + mutex_enter(&ci->ci_ctx_lock); if (CPUSET_HAS(cpus_active, ci->ci_index)) { - if (pm->pm_ctx[ci->ci_index] > 0) + if (pm->pm_ctx[ci->ci_index] > 0) { CPUSET_ADD(pmap_cpus_active, ci->ci_index); - ctx_free(pm, ci); + ctx_free(pm, ci); + } } + mutex_exit(&ci->ci_ctx_lock); } #else - ctx_free(pm, curcpu()); + if (pmap_ctx(pm)) { + mutex_enter(&curcpu()->ci_ctx_lock); + ctx_free(pm, curcpu()); + mutex_exit(&curcpu()->ci_ctx_lock); + } #endif - mutex_exit(&pmap_ctx_lock); REMOVE_STAT(flushes); + /* + * XXXMRG: couldn't we do something less severe here, and + * only flush the right context on each CPU? + */ #ifdef MULTIPROCESSOR smp_blast_dcache(pmap_cpus_active); #else @@ -2021,7 +2048,8 @@ continue; /* - * if the pmap is being torn down, don't bother flushing. + * if the pmap is being torn down, don't bother flushing, + * we already have done so. */ if (!pm->pm_refs) @@ -2166,9 +2194,6 @@ *pap = pa; return TRUE; } else { - if (pm != pmap_kernel()) { - mutex_enter(&pmap_lock); - } data = pseg_get(pm, va); pa = data & TLB_PA_MASK; #ifdef DEBUG @@ -2200,9 +2225,6 @@ printf(" pseg_get: %lx\n", (long)pa); } #endif - if (pm != pmap_kernel()) { - mutex_exit(&pmap_lock); - } } if ((data & TLB_V) == 0) return (FALSE); @@ -3073,7 +3095,7 @@ KASSERT(pm != pmap_kernel()); KASSERT(pm == curproc->p_vmspace->vm_map.pmap); - mutex_enter(&pmap_ctx_lock); + mutex_enter(&curcpu()->ci_ctx_lock); ctx = curcpu()->ci_pmap_next_ctx++; /* @@ -3108,7 +3130,7 @@ curcpu()->ci_ctxbusy[ctx] = pm->pm_physaddr; LIST_INSERT_HEAD(&curcpu()->ci_pmap_ctxlist, pm, pm_list[cpu_number()]); pmap_ctx(pm) = ctx; - mutex_exit(&pmap_ctx_lock); + mutex_exit(&curcpu()->ci_ctx_lock); DPRINTF(PDB_CTX_ALLOC, ("ctx_alloc: cpu%d allocated ctx %d\n", cpu_number(), ctx)); return ctx; @@ -3123,7 +3145,7 @@ int oldctx; int cpunum; - KASSERT(mutex_owned(&pmap_ctx_lock)); + KASSERT(mutex_owned(&curcpu()->ci_ctx_lock)); #ifdef MULTIPROCESSOR cpunum = ci->ci_index; @@ -3382,14 +3404,21 @@ } static void 
-pmap_free_page(paddr_t pa) +pmap_free_page(paddr_t pa, sparc64_cpuset_t cs) { struct vm_page *pg = PHYS_TO_VM_PAGE(pa); - dcache_flush_page_all(pa); + dcache_flush_page_cpuset(pa, cs); uvm_pagefree(pg); } +static void +pmap_free_page_noflush(paddr_t pa) +{ + struct vm_page *pg = PHYS_TO_VM_PAGE(pa); + + uvm_pagefree(pg); +} #ifdef DDB @@ -3609,7 +3638,7 @@ pmap_remove(pmap_kernel(), va, va+1); pmap_update(pmap_kernel()); - pmap_free_page(pa); + pmap_free_page(pa, cpus_active); } #endif