Next try. A problem with the last version was that amaps for shared mappings still would grow pretty large: establishing a shared mapping causes uvm to immediately allocate an amap for the whole range.
That's a problem when changing vmm to use shared memory to allow vmd(8) to access the memory of a guest with lots of RAM. This diff allows amaps to organize slots in clusters that are allocated on demand. A virtual memory range that only contains a few pages will consume only small amounts of kmem for the amap. - Amaps are clustered in chunks of 16 slots now, with a bitmap per chunk to keep track of used slots. That's cheaper than the current impl. Chunks are allocated by pool(9). - The chunks are organized in a hash table. The number of buckets is computed such that the max. number of collisions is bounded by log2(slots). - Small amaps of up to 16 slots embed the cluster directly in the amap. The extra code to handle this special case is easy IMO. Though the amap definition got more ugly. - libkvm needs to be recompiled as well because the amap layout has changed - make -j4 build has become slightly faster on my machine 21m13.20s real 33m59.18s user 20m17.36s system vs. 20m35.41s real 33m56.55s user 18m55.17s system With this diff it should be harder to run a machine out of kmem because of large amap allocations caused by creative use of mmap(). So I welcome testing and comments. 
Index: lib/libkvm/kvm_proc.c =================================================================== RCS file: /cvs/src/lib/libkvm/kvm_proc.c,v retrieving revision 1.52 diff -u -p -r1.52 kvm_proc.c --- lib/libkvm/kvm_proc.c 22 Oct 2014 04:13:35 -0000 1.52 +++ lib/libkvm/kvm_proc.c 22 Apr 2016 15:49:54 -0000 @@ -108,6 +108,56 @@ static int proc_verify(kvm_t *, const st static void ps_str_a(struct ps_strings *, u_long *, int *); static void ps_str_e(struct ps_strings *, u_long *, int *); +static struct vm_anon * +_kvm_findanon(kvm_t *kd, struct vm_amap *amapp, int slot) +{ + u_long addr; + int bucket; + struct vm_amap amap; + struct vm_amap_chunk chunk, *chunkp; + struct vm_anon *anonp; + + addr = (u_long)amapp; + if (KREAD(kd, addr, &amap)) + return (NULL); + + /* sanity-check slot number */ + if (slot > amap.am_nslot) + return (NULL); + + if (UVM_AMAP_SMALL(&amap)) + chunkp = &amapp->am_small; + else { + bucket = AMAP_BUCKET(&amap, slot); + addr = (u_long)(amap.am_buckets + bucket); + if (KREAD(kd, addr, &chunkp)) + return (NULL); + + while (chunkp != NULL) { + addr = (u_long)chunkp; + if (KREAD(kd, addr, &chunk)) + return (NULL); + + if (UVM_AMAP_BUCKET(&amap, chunk.ac_baseslot) != + bucket) + return (NULL); + if (slot >= chunk.ac_baseslot && + slot < chunk.ac_baseslot + chunk.ac_nslot) + break; + + chunkp = TAILQ_NEXT(&chunk, ac_list); + } + if (chunkp == NULL) + return (NULL); + } + + addr = (u_long)&chunkp->ac_anon[UVM_AMAP_SLOTIDX(slot)]; + if (KREAD(kd, addr, &anonp)) + return (NULL); + + return (anonp); +} + static char * _kvm_ureadm(kvm_t *kd, const struct kinfo_proc *p, u_long va, u_long *cnt) { @@ -115,7 +165,6 @@ _kvm_ureadm(kvm_t *kd, const struct kinf struct vmspace vm; struct vm_anon *anonp, anon; struct vm_map_entry vme; - struct vm_amap amap; struct vm_page pg; if (kd->swapspc == 0) { @@ -153,18 +202,11 @@ _kvm_ureadm(kvm_t *kd, const struct kinf if (vme.aref.ar_amap == NULL) return (NULL); - addr = (u_long)vme.aref.ar_amap; - if (KREAD(kd, addr, 
&amap)) - return (NULL); - offset = va - vme.start; slot = offset / kd->nbpg + vme.aref.ar_pageoff; - /* sanity-check slot number */ - if (slot > amap.am_nslot) - return (NULL); - addr = (u_long)amap.am_anon + (offset / kd->nbpg) * sizeof(anonp); - if (KREAD(kd, addr, &anonp)) + anonp = _kvm_findanon(kd, vme.aref.ar_amap, slot); + if (anonp == NULL) return (NULL); addr = (u_long)anonp; Index: sys/uvm/uvm_amap.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_amap.c,v retrieving revision 1.66 diff -u -p -r1.66 uvm_amap.c --- sys/uvm/uvm_amap.c 16 Apr 2016 18:39:31 -0000 1.66 +++ sys/uvm/uvm_amap.c 22 Apr 2016 15:49:58 -0000 @@ -44,20 +44,19 @@ #include <uvm/uvm_swap.h> /* - * pool for allocation of vm_map structures. note that in order to + * pools for allocation of vm_amap structures. note that in order to * avoid an endless loop, the amap pool's allocator cannot allocate * memory from an amap (it currently goes through the kernel uobj, so * we are ok). 
*/ struct pool uvm_amap_pool; - -/* Pools for amap slots for the most common amap slot sizes */ -struct pool uvm_amap_slot_pools[UVM_AMAP_CHUNK]; +struct pool uvm_small_amap_pool[UVM_AMAP_CHUNK]; +struct pool uvm_amap_chunk_pool; LIST_HEAD(, vm_amap) amap_list; -static char amap_slot_pool_names[UVM_AMAP_CHUNK][13]; +static char amap_small_pool_names[UVM_AMAP_CHUNK][9]; #define MALLOC_SLOT_UNIT (2 * sizeof(int) + sizeof(struct vm_anon *)) @@ -65,10 +64,12 @@ static char amap_slot_pool_names[UVM_AMA * local functions */ -static struct vm_amap *amap_alloc1(int, int); +static struct vm_amap *amap_alloc1(int, int, int); static __inline void amap_list_insert(struct vm_amap *); static __inline void amap_list_remove(struct vm_amap *); +struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int); + static __inline void amap_list_insert(struct vm_amap *amap) { @@ -81,6 +82,72 @@ amap_list_remove(struct vm_amap *amap) LIST_REMOVE(amap, am_list); } +/* + * amap_chunk_get: lookup a chunk for slot. if create is non-zero, + * the chunk is created if it does not yet exist. 
+ * + * => returns the chunk on success or NULL on error + */ +struct vm_amap_chunk * +amap_chunk_get(struct vm_amap *amap, int slot, int create) +{ + int bucket = UVM_AMAP_BUCKET(amap, slot); + int baseslot = AMAP_BASE_SLOT(slot); + int n; + struct vm_amap_chunk *chunk, *newchunk, *pchunk = NULL; + + if (UVM_AMAP_SMALL(amap)) + return &amap->am_small; + + for (chunk = amap->am_buckets[bucket]; chunk != NULL; + chunk = TAILQ_NEXT(chunk, ac_list)) { + if (UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) != bucket) + break; + if (chunk->ac_baseslot == baseslot) + return chunk; + pchunk = chunk; + } + if (!create) + return NULL; + + if (amap->am_nslot - baseslot >= UVM_AMAP_CHUNK) + n = UVM_AMAP_CHUNK; + else + n = amap->am_nslot - baseslot; + + newchunk = pool_get(&uvm_amap_chunk_pool, PR_NOWAIT | PR_ZERO); + if (newchunk == NULL) + return NULL; + + if (pchunk == NULL) { + TAILQ_INSERT_TAIL(&amap->am_chunks, newchunk, ac_list); + KASSERT(amap->am_buckets[bucket] == NULL); + amap->am_buckets[bucket] = newchunk; + } else + TAILQ_INSERT_AFTER(&amap->am_chunks, pchunk, newchunk, + ac_list); + + amap->am_ncused++; + newchunk->ac_baseslot = baseslot; + newchunk->ac_nslot = n; + return newchunk; +} + +void +amap_chunk_free(struct vm_amap *amap, struct vm_amap_chunk *chunk) +{ + int bucket = UVM_AMAP_BUCKET(amap, chunk->ac_baseslot); + + if (UVM_AMAP_SMALL(amap)) + return; + + TAILQ_REMOVE(&amap->am_chunks, chunk, ac_list); + if (amap->am_buckets[bucket] == chunk) + amap->am_buckets[bucket] = NULL; + pool_put(&uvm_amap_chunk_pool, chunk); + amap->am_ncused--; +} + #ifdef UVM_AMAP_PPREF /* * what is ppref? ppref is an _optional_ amap feature which is used @@ -157,19 +224,28 @@ void amap_init(void) { int i; + size_t size; /* Initialize the vm_amap pool. 
*/ pool_init(&uvm_amap_pool, sizeof(struct vm_amap), 0, 0, PR_WAITOK, "amappl", NULL); pool_sethiwat(&uvm_amap_pool, 4096); - for (i = 0; i < nitems(uvm_amap_slot_pools); i++) { - snprintf(amap_slot_pool_names[i], - sizeof(amap_slot_pool_names[0]), "amapslotpl%d", i + 1); - pool_init(&uvm_amap_slot_pools[i], (i + 1) * MALLOC_SLOT_UNIT, - 0, 0, PR_WAITOK, amap_slot_pool_names[i], NULL); - pool_sethiwat(&uvm_amap_slot_pools[i], 4096); + /* initialize small amap pools */ + for (i = 0; i < nitems(uvm_small_amap_pool); i++) { + snprintf(amap_small_pool_names[i], + sizeof(amap_small_pool_names[0]), "amappl%d", i + 1); + size = offsetof(struct vm_amap, am_small.ac_anon) + + (i + 1) * sizeof(struct vm_anon *); + pool_init(&uvm_small_amap_pool[i], size, 0, 0, PR_WAITOK, + amap_small_pool_names[i], NULL); } + + pool_init(&uvm_amap_chunk_pool, + sizeof(struct vm_amap_chunk) + + UVM_AMAP_CHUNK * sizeof(struct vm_anon *), 0, 0, PR_WAITOK, + "amapchunkpl", NULL); + pool_sethiwat(&uvm_amap_chunk_pool, 4096); } /* @@ -177,12 +253,53 @@ amap_init(void) * init the overlay. */ static inline struct vm_amap * -amap_alloc1(int slots, int waitf) +amap_alloc1(int slots, int waitf, int dynamic) { struct vm_amap *amap; + struct vm_amap_chunk *chunk, *tmp; + int chunks, chunkperbucket = 1, hashshift = 0; + int buckets, i, n; + int pwaitf = (waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT; + + chunks = roundup(slots, UVM_AMAP_CHUNK) / UVM_AMAP_CHUNK; - amap = pool_get(&uvm_amap_pool, (waitf == M_WAITOK) ? PR_WAITOK - : PR_NOWAIT); + if (dynamic) { + /* + * Basically, the amap is a hash map where the number of + * buckets is fixed. We select the number of buckets using the + * following strategy: + * + * 1. The maximal number of entries to search in a bucket upon + * a collision should be less than or equal to + * log2(slots / UVM_AMAP_CHUNK). This is the worst-case number + * of lookups we would have if we could chunk the amap. 
The + * log2(n) comes from the fact that amaps are chunked by + * splitting up their vm_map_entries and organizing those + * in a binary search tree. + * + * 2. The maximal number of entries in a bucket must be a + * power of two. + * + * The maximal number of entries per bucket is used to hash + * a slot to a bucket. + * + * In the future, this strategy could be refined to make it + * even harder/impossible that the total amount of KVA needed + * for the hash buckets of all amaps to exceed the maximal + * amount of KVA memory reserved for amaps. + */ + chunkperbucket = 1 << hashshift; + while ((1 << chunkperbucket) * 2 <= chunks) { + hashshift++; + chunkperbucket = 1 << hashshift; + } + } + + if (slots > UVM_AMAP_CHUNK) + amap = pool_get(&uvm_amap_pool, pwaitf); + else + amap = pool_get(&uvm_small_amap_pool[slots - 1], + pwaitf | PR_ZERO); if (amap == NULL) return(NULL); @@ -196,25 +313,48 @@ amap_alloc1(int slots, int waitf) amap->am_nslot = slots; amap->am_nused = 0; - if (slots > UVM_AMAP_CHUNK) - amap->am_slots = malloc(slots * MALLOC_SLOT_UNIT, - M_UVMAMAP, waitf); - else - amap->am_slots = pool_get( - &uvm_amap_slot_pools[slots - 1], - (waitf == M_WAITOK) ? PR_WAITOK : PR_NOWAIT); + if (slots <= UVM_AMAP_CHUNK) + return (amap); - if (amap->am_slots == NULL) + amap->am_ncused = 0; + TAILQ_INIT(&amap->am_chunks); + amap->am_hashshift = hashshift; + amap->am_buckets = NULL; + + buckets = howmany(chunks, chunkperbucket); + amap->am_buckets = mallocarray(buckets, sizeof(amap->am_buckets), + M_UVMAMAP, waitf | (dynamic ? 
M_ZERO : 0)); + if (amap->am_buckets == NULL) goto fail1; - amap->am_bckptr = (int *)(((char *)amap->am_slots) + slots * - sizeof(int)); - amap->am_anon = (struct vm_anon **)(((char *)amap->am_bckptr) + - slots * sizeof(int)); + if (!dynamic) { + for (i = 0; i < buckets; i++) { + if (i == buckets - 1) { + n = slots % UVM_AMAP_CHUNK; + if (n == 0) + n = UVM_AMAP_CHUNK; + } else + n = UVM_AMAP_CHUNK; + + chunk = pool_get(&uvm_amap_chunk_pool, + PR_ZERO | pwaitf); + if (chunk == NULL) + goto fail1; + + amap->am_buckets[i] = chunk; + amap->am_ncused++; + chunk->ac_baseslot = i * UVM_AMAP_CHUNK; + chunk->ac_nslot = n; + TAILQ_INSERT_TAIL(&amap->am_chunks, chunk, ac_list); + } + } return(amap); fail1: + free(amap->am_buckets, M_UVMAMAP, 0); + TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp) + pool_put(&uvm_amap_chunk_pool, chunk); pool_put(&uvm_amap_pool, amap); return (NULL); } @@ -226,19 +366,16 @@ fail1: * => reference count to new amap is set to one */ struct vm_amap * -amap_alloc(vaddr_t sz, int waitf) +amap_alloc(vaddr_t sz, int waitf, int dynamic) { struct vm_amap *amap; int slots; AMAP_B2SLOT(slots, sz); /* load slots */ - amap = amap_alloc1(slots, waitf); - if (amap) { - memset(amap->am_anon, 0, - amap->am_nslot * sizeof(struct vm_anon *)); + amap = amap_alloc1(slots, waitf, dynamic); + if (amap) amap_list_insert(amap); - } return(amap); } @@ -252,22 +389,24 @@ amap_alloc(vaddr_t sz, int waitf) void amap_free(struct vm_amap *amap) { + struct vm_amap_chunk *chunk, *tmp; KASSERT(amap->am_ref == 0 && amap->am_nused == 0); KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0); - if (amap->am_nslot > UVM_AMAP_CHUNK) - free(amap->am_slots, M_UVMAMAP, 0); - else - pool_put(&uvm_amap_slot_pools[amap->am_nslot - 1], - amap->am_slots); - #ifdef UVM_AMAP_PPREF if (amap->am_ppref && amap->am_ppref != PPREF_NONE) free(amap->am_ppref, M_UVMAMAP, 0); #endif - pool_put(&uvm_amap_pool, amap); + if (UVM_AMAP_SMALL(amap)) + pool_put(&uvm_small_amap_pool[amap->am_nslot - 1], 
amap); + else { + TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp) + pool_put(&uvm_amap_chunk_pool, chunk); + free(amap->am_buckets, M_UVMAMAP, 0); + pool_put(&uvm_amap_pool, amap); + } } /* @@ -280,8 +419,9 @@ amap_free(struct vm_amap *amap) void amap_wipeout(struct vm_amap *amap) { - int lcv, slot; + int slot; struct vm_anon *anon; + struct vm_amap_chunk *chunk; KASSERT(amap->am_ref == 0); @@ -291,19 +431,25 @@ amap_wipeout(struct vm_amap *amap) } amap_list_remove(amap); - for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { - int refs; + AMAP_CHUNK_FOREACH(chunk, amap) { + int i, refs, map = chunk->ac_usedmap; - slot = amap->am_slots[lcv]; - anon = amap->am_anon[slot]; + for (i = ffs(map); i != 0; i = ffs(map)) { + slot = i - 1; + map ^= 1 << slot; + anon = chunk->ac_anon[slot]; - if (anon == NULL || anon->an_ref == 0) - panic("amap_wipeout: corrupt amap"); + if (anon == NULL || anon->an_ref == 0) + panic("amap_wipeout: corrupt amap"); - refs = --anon->an_ref; - if (refs == 0) { - /* we had the last reference to a vm_anon. free it. */ - uvm_anfree(anon); + refs = --anon->an_ref; + if (refs == 0) { + /* + * we had the last reference to a vm_anon. + * free it. + */ + uvm_anfree(anon); + } } } @@ -330,8 +476,10 @@ amap_copy(struct vm_map *map, struct vm_ boolean_t canchunk, vaddr_t startva, vaddr_t endva) { struct vm_amap *amap, *srcamap; - int slots, lcv; + int slots, lcv, dynamic = 0; vaddr_t chunksize; + int i, j, k, n, srcslot; + struct vm_amap_chunk *chunk = NULL, *srcchunk = NULL; /* is there a map to copy? if not, create one from scratch. */ if (entry->aref.ar_amap == NULL) { @@ -339,22 +487,29 @@ amap_copy(struct vm_map *map, struct vm_ * check to see if we have a large amap that we can * chunk. we align startva/endva to chunk-sized * boundaries and then clip to them. + * + * if we cannot chunk the amap, allocate it in a way + * that makes it grow or shrink dynamically with + * the number of slots. 
*/ - if (canchunk && atop(entry->end - entry->start) >= - UVM_AMAP_LARGE) { - /* convert slots to bytes */ - chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT; - startva = (startva / chunksize) * chunksize; - endva = roundup(endva, chunksize); - UVM_MAP_CLIP_START(map, entry, startva); - /* watch out for endva wrap-around! */ - if (endva >= startva) - UVM_MAP_CLIP_END(map, entry, endva); + if (atop(entry->end - entry->start) >= UVM_AMAP_LARGE) { + if (canchunk) { + /* convert slots to bytes */ + chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT; + startva = (startva / chunksize) * chunksize; + endva = roundup(endva, chunksize); + UVM_MAP_CLIP_START(map, entry, startva); + /* watch out for endva wrap-around! */ + if (endva >= startva) + UVM_MAP_CLIP_END(map, entry, endva); + } else + dynamic = 1; } entry->aref.ar_pageoff = 0; entry->aref.ar_amap = amap_alloc(entry->end - entry->start, - waitf); + waitf, dynamic); + if (entry->aref.ar_amap != NULL) entry->etype &= ~UVM_ET_NEEDSCOPY; return; @@ -373,7 +528,9 @@ amap_copy(struct vm_map *map, struct vm_ /* looks like we need to copy the map. */ AMAP_B2SLOT(slots, entry->end - entry->start); - amap = amap_alloc1(slots, waitf); + amap = amap_alloc1(slots, waitf, + entry->aref.ar_amap->am_nslot > UVM_AMAP_CHUNK && + entry->aref.ar_amap->am_hashshift != 0); if (amap == NULL) return; srcamap = entry->aref.ar_amap; @@ -392,17 +549,39 @@ amap_copy(struct vm_map *map, struct vm_ } /* we must copy it now. 
*/ - for (lcv = 0 ; lcv < slots; lcv++) { - amap->am_anon[lcv] = - srcamap->am_anon[entry->aref.ar_pageoff + lcv]; - if (amap->am_anon[lcv] == NULL) + for (lcv = 0; lcv < slots; lcv += n) { + srcslot = entry->aref.ar_pageoff + lcv; + i = UVM_AMAP_SLOTIDX(lcv); + j = UVM_AMAP_SLOTIDX(srcslot); + n = UVM_AMAP_CHUNK; + if (i > j) + n -= i; + else + n -= j; + if (lcv + n > slots) + n = slots - lcv; + + srcchunk = amap_chunk_get(srcamap, srcslot, 0); + if (srcchunk == NULL) continue; - amap->am_anon[lcv]->an_ref++; - amap->am_bckptr[lcv] = amap->am_nused; - amap->am_slots[amap->am_nused] = lcv; - amap->am_nused++; + + chunk = amap_chunk_get(amap, lcv, 1); + if (chunk == NULL) { + amap->am_ref = 0; + amap_wipeout(amap); + return; + } + + for (k = 0; k < n; i++, j++, k++) { + chunk->ac_anon[i] = srcchunk->ac_anon[j]; + if (chunk->ac_anon[i] == NULL) + continue; + + chunk->ac_usedmap |= (1 << i); + chunk->ac_anon[i]->an_ref++; + amap->am_nused++; + } } - KASSERT(lcv == amap->am_nslot); /* * drop our reference to the old amap (srcamap). @@ -446,9 +625,10 @@ void amap_cow_now(struct vm_map *map, struct vm_map_entry *entry) { struct vm_amap *amap = entry->aref.ar_amap; - int lcv, slot; + int slot; struct vm_anon *anon, *nanon; struct vm_page *pg, *npg; + struct vm_amap_chunk *chunk; /* * note that if we wait, we must ReStart the "lcv" for loop because @@ -456,22 +636,27 @@ amap_cow_now(struct vm_map *map, struct * am_anon[] array on us. 
*/ ReStart: - for (lcv = 0 ; lcv < amap->am_nused ; lcv++) { - /* get the page */ - slot = amap->am_slots[lcv]; - anon = amap->am_anon[slot]; - pg = anon->an_page; - - /* page must be resident since parent is wired */ - if (pg == NULL) - panic("amap_cow_now: non-resident wired page" - " in anon %p", anon); + AMAP_CHUNK_FOREACH(chunk, amap) { + int i, map = chunk->ac_usedmap; + + for (i = ffs(map); i != 0; i = ffs(map)) { + slot = i - 1; + map ^= 1 << slot; + anon = chunk->ac_anon[slot]; + pg = anon->an_page; + + /* page must be resident since parent is wired */ + if (pg == NULL) + panic("amap_cow_now: non-resident wired page" + " in anon %p", anon); + + /* + * if the anon ref count is one, we are safe (the child + * has exclusive access to the page). + */ + if (anon->an_ref <= 1) + continue; - /* - * if the anon ref count is one, we are safe (the child has - * exclusive access to the page). - */ - if (anon->an_ref > 1) { /* * if the page is busy then we have to wait for * it and then restart. @@ -501,14 +686,14 @@ ReStart: uvm_wait("cownowpage"); goto ReStart; } - + /* * got it... now we can copy the data and replace anon * with our new one... */ uvm_pagecopy(pg, npg); /* old -> new */ anon->an_ref--; /* can't drop to zero */ - amap->am_anon[slot] = nanon; /* replace */ + chunk->ac_anon[slot] = nanon; /* replace */ /* * drop PG_BUSY on new page ... since we have had its @@ -652,61 +837,83 @@ amap_pp_adjref(struct vm_amap *amap, int void amap_wiperange(struct vm_amap *amap, int slotoff, int slots) { - int byanon, lcv, stop, curslot, ptr, slotend; + int curslot, i, map, bybucket; + int bucket, startbucket, endbucket; + int startbase, endbase; struct vm_anon *anon; + struct vm_amap_chunk *chunk, *nchunk; /* - * we can either traverse the amap by am_anon or by am_slots depending - * on which is cheaper. decide now. + * we can either traverse the amap by am_chunks or by am_buckets + * depending on which is cheaper. decide now. 
*/ - if (slots < amap->am_nused) { - byanon = TRUE; - lcv = slotoff; - stop = slotoff + slots; + startbucket = UVM_AMAP_BUCKET(amap, slotoff); + endbucket = UVM_AMAP_BUCKET(amap, slotoff + slots - 1); + + if (UVM_AMAP_SMALL(amap)) { + bybucket = FALSE; + chunk = &amap->am_small; + } else if (endbucket + 1 - startbucket >= amap->am_ncused) { + bybucket = FALSE; + chunk = TAILQ_FIRST(&amap->am_chunks); } else { - byanon = FALSE; - lcv = 0; - stop = amap->am_nused; - slotend = slotoff + slots; + bybucket = TRUE; + bucket = startbucket; + chunk = amap->am_buckets[bucket]; } - while (lcv < stop) { - int refs; - - if (byanon) { - curslot = lcv++; /* lcv advances here */ - if (amap->am_anon[curslot] == NULL) - continue; - } else { - curslot = amap->am_slots[lcv]; - if (curslot < slotoff || curslot >= slotend) { - lcv++; /* lcv advances here */ - continue; - } - stop--; /* drop stop, since anon will be removed */ + startbase = AMAP_BASE_SLOT(slotoff); + endbase = AMAP_BASE_SLOT(slotoff + slots - 1); + for (;;) { + if (chunk == NULL || (bybucket && + UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) != bucket)) { + if (!bybucket || bucket >= endbucket) + break; + bucket++; + chunk = amap->am_buckets[bucket]; + continue; + } else if (!bybucket) { + if (chunk->ac_baseslot + chunk->ac_nslot <= slotoff) + goto next; + if (chunk->ac_baseslot >= slotoff + slots) + goto next; } - anon = amap->am_anon[curslot]; - /* remove it from the amap */ - amap->am_anon[curslot] = NULL; - ptr = amap->am_bckptr[curslot]; - if (ptr != (amap->am_nused - 1)) { - amap->am_slots[ptr] = - amap->am_slots[amap->am_nused - 1]; - amap->am_bckptr[amap->am_slots[ptr]] = - ptr; /* back ptr. */ - } - amap->am_nused--; - - /* drop anon reference count */ - refs = --anon->an_ref; - if (refs == 0) { - /* - * we just eliminated the last reference to an anon. - * free it. 
- */ - uvm_anfree(anon); + map = chunk->ac_usedmap; + if (startbase == chunk->ac_baseslot) + map &= ~((1 << (slotoff - startbase)) - 1); + if (endbase == chunk->ac_baseslot) + map &= (1 << (slotoff + slots - endbase)) - 1; + + for (i = ffs(map); i != 0; i = ffs(map)) { + int refs; + + curslot = i - 1; + map ^= 1 << curslot; + chunk->ac_usedmap ^= 1 << curslot; + anon = chunk->ac_anon[curslot]; + + /* remove it from the amap */ + chunk->ac_anon[curslot] = NULL; + + amap->am_nused--; + + /* drop anon reference count */ + refs = --anon->an_ref; + if (refs == 0) { + /* + * we just eliminated the last reference to an + * anon. free it. + */ + uvm_anfree(anon); + } } + +next: + nchunk = TAILQ_NEXT(chunk, ac_list); + if (chunk->ac_usedmap == 0) + amap_chunk_free(amap, chunk); + chunk = nchunk; } } @@ -728,31 +935,38 @@ amap_swap_off(int startslot, int endslot boolean_t rv = FALSE; for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) { - int i; + int i, map; + struct vm_amap_chunk *chunk; - for (i = 0; i < am->am_nused; i++) { - int slot; - int swslot; - struct vm_anon *anon; - - slot = am->am_slots[i]; - anon = am->am_anon[slot]; - - swslot = anon->an_swslot; - if (swslot < startslot || endslot <= swslot) { - continue; - } +again: + AMAP_CHUNK_FOREACH(chunk, am) { + map = chunk->ac_usedmap; + + for (i = ffs(map); i != 0; i = ffs(map)) { + int swslot; + int slot = i - 1; + struct vm_anon *anon; + + map ^= 1 << slot; + anon = chunk->ac_anon[slot]; + + swslot = anon->an_swslot; + if (swslot < startslot || endslot <= swslot) { + continue; + } - am->am_flags |= AMAP_SWAPOFF; + am->am_flags |= AMAP_SWAPOFF; - rv = uvm_anon_pagein(anon); + rv = uvm_anon_pagein(anon); - am->am_flags &= ~AMAP_SWAPOFF; - if (rv || amap_refs(am) == 0) - break; - i = 0; + am->am_flags &= ~AMAP_SWAPOFF; + if (rv || amap_refs(am) == 0) + goto nextamap; + goto again; + } } +nextamap: am_next = LIST_NEXT(am, am_list); if (amap_refs(am) == 0) amap_wipeout(am); @@ -769,6 +983,7 @@ 
amap_lookup(struct vm_aref *aref, vaddr_ { int slot; struct vm_amap *amap = aref->ar_amap; + struct vm_amap_chunk *chunk; AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; @@ -776,7 +991,11 @@ amap_lookup(struct vm_aref *aref, vaddr_ if (slot >= amap->am_nslot) panic("amap_lookup: offset out of range"); - return(amap->am_anon[slot]); + chunk = amap_chunk_get(amap, slot, 0); + if (chunk == NULL) + return NULL; + + return chunk->ac_anon[UVM_AMAP_SLOTIDX(slot)]; } /* @@ -788,8 +1007,9 @@ void amap_lookups(struct vm_aref *aref, vaddr_t offset, struct vm_anon **anons, int npages) { - int slot; + int i, lcv, n, slot; struct vm_amap *amap = aref->ar_amap; + struct vm_amap_chunk *chunk = NULL; AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; @@ -797,49 +1017,65 @@ amap_lookups(struct vm_aref *aref, vaddr if ((slot + (npages - 1)) >= amap->am_nslot) panic("amap_lookups: offset out of range"); - memcpy(anons, &amap->am_anon[slot], npages * sizeof(struct vm_anon *)); - - return; + for (i = 0, lcv = slot; lcv < slot + npages; i += n, lcv += n) { + n = UVM_AMAP_CHUNK - UVM_AMAP_SLOTIDX(lcv); + if (lcv + n > slot + npages) + n = slot + npages - lcv; + + chunk = amap_chunk_get(amap, lcv, 0); + if (chunk == NULL) + memset(&anons[i], 0, n * sizeof(*anons[i])); + else + memcpy(&anons[i], + &chunk->ac_anon[UVM_AMAP_SLOTIDX(lcv)], + n * sizeof(*anons[i])); + } } /* * amap_add: add (or replace) a page to an amap * - * => returns an "offset" which is meaningful to amap_unadd(). + * => returns 0 if adding the page was successful or 1 when not. 
*/ -void +int amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon, boolean_t replace) { int slot; struct vm_amap *amap = aref->ar_amap; + struct vm_amap_chunk *chunk; AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; if (slot >= amap->am_nslot) panic("amap_add: offset out of range"); + chunk = amap_chunk_get(amap, slot, 1); + if (chunk == NULL) + return 1; + slot = UVM_AMAP_SLOTIDX(slot); if (replace) { - if (amap->am_anon[slot] == NULL) + if (chunk->ac_anon[slot] == NULL) panic("amap_add: replacing null anon"); - if (amap->am_anon[slot]->an_page != NULL && + if (chunk->ac_anon[slot]->an_page != NULL && (amap->am_flags & AMAP_SHARED) != 0) { - pmap_page_protect(amap->am_anon[slot]->an_page, + pmap_page_protect(chunk->ac_anon[slot]->an_page, PROT_NONE); /* * XXX: suppose page is supposed to be wired somewhere? */ } } else { /* !replace */ - if (amap->am_anon[slot] != NULL) + if (chunk->ac_anon[slot] != NULL) panic("amap_add: slot in use"); - amap->am_bckptr[slot] = amap->am_nused; - amap->am_slots[amap->am_nused] = slot; + chunk->ac_usedmap |= 1 << slot; amap->am_nused++; } - amap->am_anon[slot] = anon; + chunk->ac_anon[slot] = anon; + + return 0; } /* @@ -848,26 +1084,29 @@ amap_add(struct vm_aref *aref, vaddr_t o void amap_unadd(struct vm_aref *aref, vaddr_t offset) { - int ptr, slot; + int slot; struct vm_amap *amap = aref->ar_amap; + struct vm_amap_chunk *chunk; AMAP_B2SLOT(slot, offset); slot += aref->ar_pageoff; if (slot >= amap->am_nslot) panic("amap_unadd: offset out of range"); + chunk = amap_chunk_get(amap, slot, 0); + if (chunk == NULL) + panic("amap_unadd: chunk for slot %d not present", slot); - if (amap->am_anon[slot] == NULL) + slot = UVM_AMAP_SLOTIDX(slot); + if (chunk->ac_anon[slot] == NULL) panic("amap_unadd: nothing there"); - amap->am_anon[slot] = NULL; - ptr = amap->am_bckptr[slot]; - - if (ptr != (amap->am_nused - 1)) { /* swap to keep slots contig? 
*/ - amap->am_slots[ptr] = amap->am_slots[amap->am_nused - 1]; - amap->am_bckptr[amap->am_slots[ptr]] = ptr; /* back link */ - } + chunk->ac_anon[slot] = NULL; + chunk->ac_usedmap &= ~(1 << slot); amap->am_nused--; + + if (chunk->ac_usedmap == 0) + amap_chunk_free(amap, chunk); } /* Index: sys/uvm/uvm_amap.h =================================================================== RCS file: /cvs/src/sys/uvm/uvm_amap.h,v retrieving revision 1.24 diff -u -p -r1.24 uvm_amap.h --- sys/uvm/uvm_amap.h 16 Apr 2016 18:39:31 -0000 1.24 +++ sys/uvm/uvm_amap.h 22 Apr 2016 15:49:58 -0000 @@ -61,12 +61,11 @@ struct vm_amap; /* * prototypes for the amap interface */ - /* add an anon to an amap */ -void amap_add(struct vm_aref *, vaddr_t, struct vm_anon *, +int amap_add(struct vm_aref *, vaddr_t, struct vm_anon *, boolean_t); /* allocate a new amap */ -struct vm_amap *amap_alloc(vaddr_t, int); +struct vm_amap *amap_alloc(vaddr_t, int, int); /* clear amap needs-copy flag */ void amap_copy(vm_map_t, vm_map_entry_t, int, boolean_t, vaddr_t, vaddr_t); @@ -110,57 +109,78 @@ boolean_t amap_swap_off(int, int); /* * we currently provide an array-based amap implementation. in this - * implementation we provide the option of tracking split references - * so that we don't lose track of references during partial unmaps - * ... this is enabled with the "UVM_AMAP_PPREF" define. + * implementation we track split references so that we don't lose + * track of references during partial unmaps */ #define UVM_AMAP_PPREF /* track partial references */ /* - * here is the definition of the vm_amap structure for this implementation. + * here is the definition of the vm_amap structure and helper structures for + * this implementation. 
*/ +struct vm_amap_chunk { + TAILQ_ENTRY(vm_amap_chunk) ac_list; + int ac_baseslot; + uint16_t ac_usedmap; + uint16_t ac_nslot; + struct vm_anon *ac_anon[]; +}; + struct vm_amap { int am_ref; /* reference count */ int am_flags; /* flags */ int am_nslot; /* # of slots currently in map */ int am_nused; /* # of slots currently in use */ - int *am_slots; /* contig array of active slots */ - int *am_bckptr; /* back pointer array to am_slots */ - struct vm_anon **am_anon; /* array of anonymous pages */ #ifdef UVM_AMAP_PPREF int *am_ppref; /* per page reference count (if !NULL) */ #endif LIST_ENTRY(vm_amap) am_list; + + union { + struct { + struct vm_amap_chunk **amn_buckets; + TAILQ_HEAD(, vm_amap_chunk) amn_chunks; + int amn_ncused; /* # of chunkers currently in use */ + int amn_hashshift; /* shift count to hash slot to bucket */ + } ami_normal; + + /* + * MUST be last element in vm_amap because it contains a + * variably sized array element. + */ + struct vm_amap_chunk ami_small; + } am_impl; + +#define am_buckets am_impl.ami_normal.amn_buckets +#define am_chunks am_impl.ami_normal.amn_chunks +#define am_ncused am_impl.ami_normal.amn_ncused +#define am_hashshift am_impl.ami_normal.amn_hashshift + +#define am_small am_impl.ami_small }; /* - * note that am_slots, am_bckptr, and am_anon are arrays. this allows - * fast lookup of pages based on their virual address at the expense of - * some extra memory. in the future we should be smarter about memory - * usage and fall back to a non-array based implementation on systems - * that are short of memory (XXXCDC). - * - * the entries in the array are called slots... for example an amap that - * covers four pages of virtual memory is said to have four slots. here - * is an example of the array usage for a four slot amap. note that only - * slots one and three have anons assigned to them. "D/C" means that we - * "don't care" about the value. 
- * - * 0 1 2 3 - * am_anon: NULL, anon0, NULL, anon1 (actual pointers to anons) - * am_bckptr: D/C, 1, D/C, 0 (points to am_slots entry) + * The entries in an amap are called slots. For example an amap that + * covers four pages is said to have four slots. * - * am_slots: 3, 1, D/C, D/C (says slots 3 and 1 are in use) - * - * note that am_bckptr is D/C if the slot in am_anon is set to NULL. - * to find the entry in am_slots for an anon, look at am_bckptr[slot], - * thus the entry for slot 3 in am_slots[] is at am_slots[am_bckptr[3]]. - * in general, if am_anon[X] is non-NULL, then the following must be - * true: am_slots[am_bckptr[X]] == X + * The slots of an amap are clustered into chunks of UVM_AMAP_CHUNK + * slots each. The data structure of a chunk is vm_amap_chunk. + * Every chunk contains an array of pointers to vm_anon, and a bitmap + * is used to represent which of the slots are in use. + * + * Small amaps of up to UVM_AMAP_CHUNK slots have the chunk directly + * embedded in the amap structure. + * + * amaps with more slots are normal amaps and organize chunks in a hash + * table. The hash table is organized as an array of buckets. + * All chunks of the amap are additionally stored in a linked list. + * Chunks that belong to the same hash bucket are stored in the list + * consecutively. When all slots in a chunk are unused, the chunk is freed. * - * note that am_slots is always contig-packed. + * For large amaps, the bucket array can grow large. See the description + * below how large bucket arrays are avoided. 
*/ /* @@ -203,6 +223,11 @@ struct vm_amap { #define UVM_AMAP_LARGE 256 /* # of slots in "large" amap */ #define UVM_AMAP_CHUNK 16 /* # of slots to chunk large amaps in */ +#define UVM_AMAP_SMALL(amap) ((amap)->am_nslot <= UVM_AMAP_CHUNK) +#define UVM_AMAP_SLOTIDX(slot) ((slot) % UVM_AMAP_CHUNK) +#define UVM_AMAP_BUCKET(amap, slot) \ + (((slot) / UVM_AMAP_CHUNK) >> (amap)->am_hashshift) + #ifdef _KERNEL /* @@ -214,6 +239,14 @@ struct vm_amap { KASSERT(((B) & (PAGE_SIZE - 1)) == 0); \ (S) = (B) >> PAGE_SHIFT; \ } + +#define AMAP_CHUNK_FOREACH(chunk, amap) \ + for (chunk = (UVM_AMAP_SMALL(amap) ? \ + &(amap)->am_small : TAILQ_FIRST(&(amap)->am_chunks)); \ + (chunk) != NULL; (chunk) = TAILQ_NEXT(chunk, ac_list)) + +#define AMAP_BASE_SLOT(slot) \ + (((slot) / UVM_AMAP_CHUNK) * UVM_AMAP_CHUNK) /* * flags macros Index: sys/uvm/uvm_fault.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_fault.c,v retrieving revision 1.89 diff -u -p -r1.89 uvm_fault.c --- sys/uvm/uvm_fault.c 29 Mar 2016 12:04:26 -0000 1.89 +++ sys/uvm/uvm_fault.c 22 Apr 2016 15:49:58 -0000 @@ -902,8 +902,13 @@ ReFault: /* un-busy! new page */ atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE); UVM_PAGE_OWN(pg, NULL); - amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start, - anon, 1); + if (amap_add(&ufi.entry->aref, + ufi.orig_rvaddr - ufi.entry->start, anon, 1)) { + uvm_anfree(anon); + uvmfault_unlockall(&ufi, amap, NULL, oanon); + uvmexp.fltnoamap++; + return (ENOMEM); + } /* deref: can not drop to zero here by defn! 
*/ oanon->an_ref--; @@ -1175,8 +1180,13 @@ Case2: */ } - amap_add(&ufi.entry->aref, ufi.orig_rvaddr - ufi.entry->start, - anon, 0); + if (amap_add(&ufi.entry->aref, + ufi.orig_rvaddr - ufi.entry->start, anon, 0)) { + uvm_anfree(anon); + uvmfault_unlockall(&ufi, amap, NULL, oanon); + uvmexp.fltnoamap++; + return (ENOMEM); + } } /* note: pg is either the uobjpage or the new page in the new anon */ Index: sys/uvm/uvm_map.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_map.c,v retrieving revision 1.211 diff -u -p -r1.211 uvm_map.c --- sys/uvm/uvm_map.c 4 Apr 2016 16:34:16 -0000 1.211 +++ sys/uvm/uvm_map.c 22 Apr 2016 15:49:58 -0000 @@ -1084,7 +1084,7 @@ uvm_mapanon(struct vm_map *map, vaddr_t if (flags & UVM_FLAG_OVERLAY) { KERNEL_LOCK(); entry->aref.ar_pageoff = 0; - entry->aref.ar_amap = amap_alloc(sz, M_WAITOK); + entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0); KERNEL_UNLOCK(); } @@ -1340,7 +1340,7 @@ uvm_map(struct vm_map *map, vaddr_t *add } if (flags & UVM_FLAG_OVERLAY) { entry->aref.ar_pageoff = 0; - entry->aref.ar_amap = amap_alloc(sz, M_WAITOK); + entry->aref.ar_amap = amap_alloc(sz, M_WAITOK, 0); } /* Update map and process statistics. */ Index: sys/uvm/uvm_stat.c =================================================================== RCS file: /cvs/src/sys/uvm/uvm_stat.c,v retrieving revision 1.28 diff -u -p -r1.28 uvm_stat.c --- sys/uvm/uvm_stat.c 25 Oct 2014 12:54:16 -0000 1.28 +++ sys/uvm/uvm_stat.c 22 Apr 2016 15:49:58 -0000 @@ -69,9 +69,9 @@ uvmexp_print(int (*pr)(const char *, ... 
uvmexp.softs, uvmexp.syscalls, uvmexp.kmapent); (*pr)(" fault counts:\n"); - (*pr)(" noram=%d, noanon=%d, pgwait=%d, pgrele=%d\n", - uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltpgwait, - uvmexp.fltpgrele); + (*pr)(" noram=%d, noanon=%d, noamap=%d, pgwait=%d, pgrele=%d\n", + uvmexp.fltnoram, uvmexp.fltnoanon, uvmexp.fltnoamap, + uvmexp.fltpgwait, uvmexp.fltpgrele); (*pr)(" ok relocks(total)=%d(%d), anget(retries)=%d(%d), " "amapcopy=%d\n", uvmexp.fltrelckok, uvmexp.fltrelck, uvmexp.fltanget, uvmexp.fltanretry, uvmexp.fltamcopy); Index: sys/uvm/uvmexp.h =================================================================== RCS file: /cvs/src/sys/uvm/uvmexp.h,v retrieving revision 1.1 diff -u -p -r1.1 uvmexp.h --- sys/uvm/uvmexp.h 8 Jul 2014 17:19:26 -0000 1.1 +++ sys/uvm/uvmexp.h 22 Apr 2016 15:49:59 -0000 @@ -107,6 +107,7 @@ struct uvmexp { /* fault subcounters */ int fltnoram; /* number of times fault was out of ram */ int fltnoanon; /* number of times fault was out of anons */ + int fltnoamap; /* number of times fault was out of amap chunks */ int fltpgwait; /* number of times fault had to wait on a page */ int fltpgrele; /* number of times fault found a released page */ int fltrelck; /* number of times fault relock called */