Author: attilio
Date: Fri Aug  9 11:28:55 2013
New Revision: 254141
URL: http://svnweb.freebsd.org/changeset/base/254141

Log:
  On all architectures, avoid preallocating the physical memory
  for nodes used in vm_radix.
  On architectures supporting direct mapping, also avoid preallocating
  the KVA for such nodes.
  
  In order to do so, allow the operations derived from vm_radix_insert()
  to fail, and handle all of the resulting failures.
  
  On the vm_radix side, introduce a new function called vm_radix_replace(),
  which replaces a leaf node that is already present with a new one,
  and take into account the possibility that, during the allocation
  performed by vm_radix_insert(), operations on the radix trie can recurse.
  This means that if such operations recursed during vm_radix_insert(),
  vm_radix_insert() will start again from scratch.
  
  Sponsored by: EMC / Isilon storage division
  Reviewed by:  alc (older version)
  Reviewed by:  jeff
  Tested by:    pho, scottl

Modified:
  head/sys/amd64/amd64/pmap.c
  head/sys/dev/drm2/i915/i915_gem.c
  head/sys/dev/drm2/ttm/ttm_bo_vm.c
  head/sys/i386/i386/pmap.c
  head/sys/kern/subr_uio.c
  head/sys/vm/_vm_radix.h
  head/sys/vm/device_pager.c
  head/sys/vm/sg_pager.c
  head/sys/vm/vm_fault.c
  head/sys/vm/vm_object.c
  head/sys/vm/vm_object.h
  head/sys/vm/vm_page.c
  head/sys/vm/vm_page.h
  head/sys/vm/vm_radix.c
  head/sys/vm/vm_radix.h

Modified: head/sys/amd64/amd64/pmap.c
==============================================================================
--- head/sys/amd64/amd64/pmap.c Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/amd64/amd64/pmap.c Fri Aug  9 11:28:55 2013        (r254141)
@@ -283,7 +283,7 @@ static boolean_t pmap_enter_pde(pmap_t p
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
@@ -1526,12 +1526,12 @@ pmap_add_delayed_free_list(vm_page_t m, 
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  */
-static __inline void
+static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-       vm_radix_insert(&pmap->pm_root, mpte);
+       return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
@@ -3439,7 +3439,13 @@ setpte:
            ("pmap_promote_pde: page table page is out of range"));
        KASSERT(mpte->pindex == pmap_pde_pindex(va),
            ("pmap_promote_pde: page table page's pindex is wrong"));
-       pmap_insert_pt_page(pmap, mpte);
+       if (pmap_insert_pt_page(pmap, mpte)) {
+               atomic_add_long(&pmap_pde_p_failures, 1);
+               CTR2(KTR_PMAP,
+                   "pmap_promote_pde: failure for va %#lx in pmap %p", va,
+                   pmap);
+               return;
+       }
 
        /*
         * Promote the pv entries.

Modified: head/sys/dev/drm2/i915/i915_gem.c
==============================================================================
--- head/sys/dev/drm2/i915/i915_gem.c   Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/dev/drm2/i915/i915_gem.c   Fri Aug  9 11:28:55 2013        (r254141)
@@ -64,6 +64,9 @@ __FBSDID("$FreeBSD$");
 #include <sys/sched.h>
 #include <sys/sf_buf.h>
 
+#include <vm/vm.h>
+#include <vm/vm_pageout.h>
+
 static void i915_gem_object_flush_cpu_write_domain(
     struct drm_i915_gem_object *obj);
 static uint32_t i915_gem_get_gtt_size(struct drm_device *dev, uint32_t size,
@@ -1443,8 +1446,14 @@ retry:
                vm_page_busy_sleep(m, "915pbs");
                goto retry;
        }
+       if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
+               DRM_UNLOCK(dev);
+               VM_OBJECT_WUNLOCK(vm_obj);
+               VM_WAIT;
+               VM_OBJECT_WLOCK(vm_obj);
+               goto retry;
+       }
        m->valid = VM_PAGE_BITS_ALL;
-       vm_page_insert(m, vm_obj, OFF_TO_IDX(offset));
 have_page:
        *mres = m;
        vm_page_xbusy(m);

Modified: head/sys/dev/drm2/ttm/ttm_bo_vm.c
==============================================================================
--- head/sys/dev/drm2/ttm/ttm_bo_vm.c   Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/dev/drm2/ttm/ttm_bo_vm.c   Fri Aug  9 11:28:55 2013        (r254141)
@@ -47,6 +47,7 @@ __FBSDID("$FreeBSD$");
 
 #include <vm/vm.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 
 #define TTM_BO_VM_NUM_PREFAULT 16
 
@@ -221,16 +222,23 @@ reserve:
                ttm_bo_unreserve(bo);
                goto retry;
        }
-       m->valid = VM_PAGE_BITS_ALL;
-       *mres = m;
        m1 = vm_page_lookup(vm_obj, OFF_TO_IDX(offset));
        if (m1 == NULL) {
-               vm_page_insert(m, vm_obj, OFF_TO_IDX(offset));
+               if (vm_page_insert(m, vm_obj, OFF_TO_IDX(offset))) {
+                       VM_OBJECT_WUNLOCK(vm_obj);
+                       VM_WAIT;
+                       VM_OBJECT_WLOCK(vm_obj);
+                       ttm_mem_io_unlock(man);
+                       ttm_bo_unreserve(bo);
+                       goto retry;
+               }
        } else {
                KASSERT(m == m1,
                    ("inconsistent insert bo %p m %p m1 %p offset %jx",
                    bo, m, m1, (uintmax_t)offset));
        }
+       m->valid = VM_PAGE_BITS_ALL;
+       *mres = m;
        vm_page_xbusy(m);
 
        if (oldm != NULL) {

Modified: head/sys/i386/i386/pmap.c
==============================================================================
--- head/sys/i386/i386/pmap.c   Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/i386/i386/pmap.c   Fri Aug  9 11:28:55 2013        (r254141)
@@ -304,7 +304,7 @@ static boolean_t pmap_enter_pde(pmap_t p
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 static void pmap_flush_page(vm_page_t m);
-static void pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
+static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
@@ -1604,12 +1604,12 @@ pmap_add_delayed_free_list(vm_page_t m, 
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  */
-static __inline void
+static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte)
 {
 
        PMAP_LOCK_ASSERT(pmap, MA_OWNED);
-       vm_radix_insert(&pmap->pm_root, mpte);
+       return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
@@ -3401,7 +3401,13 @@ setpte:
            ("pmap_promote_pde: page table page is out of range"));
        KASSERT(mpte->pindex == va >> PDRSHIFT,
            ("pmap_promote_pde: page table page's pindex is wrong"));
-       pmap_insert_pt_page(pmap, mpte);
+       if (pmap_insert_pt_page(pmap, mpte)) {
+               pmap_pde_p_failures++;
+               CTR2(KTR_PMAP,
+                   "pmap_promote_pde: failure for va %#x in pmap %p", va,
+                   pmap);
+               return;
+       }
 
        /*
         * Promote the pv entries.

Modified: head/sys/kern/subr_uio.c
==============================================================================
--- head/sys/kern/subr_uio.c    Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/kern/subr_uio.c    Fri Aug  9 11:28:55 2013        (r254141)
@@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/vm_param.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_page.h>
+#include <vm/vm_pageout.h>
 #include <vm/vm_map.h>
 #ifdef SOCKET_SEND_COW
 #include <vm/vm_object.h>
@@ -122,7 +123,12 @@ retry:
                if (uobject->backing_object != NULL)
                        pmap_remove(map->pmap, uaddr, uaddr + PAGE_SIZE);
        }
-       vm_page_insert(kern_pg, uobject, upindex);
+       if (vm_page_insert(kern_pg, uobject, upindex)) {
+               VM_OBJECT_WUNLOCK(uobject);
+               VM_WAIT;
+               VM_OBJECT_WLOCK(uobject);
+               goto retry;
+       }
        vm_page_dirty(kern_pg);
        VM_OBJECT_WUNLOCK(uobject);
        vm_map_lookup_done(map, entry);

Modified: head/sys/vm/_vm_radix.h
==============================================================================
--- head/sys/vm/_vm_radix.h     Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/_vm_radix.h     Fri Aug  9 11:28:55 2013        (r254141)
@@ -36,8 +36,12 @@
  */
 struct vm_radix {
        uintptr_t       rt_root;
+       uint8_t         rt_flags;
 };
 
+#define        RT_INSERT_INPROG        0x01
+#define        RT_TRIE_MODIFIED        0x02
+
 #ifdef _KERNEL
 
 static __inline boolean_t

Modified: head/sys/vm/device_pager.c
==============================================================================
--- head/sys/vm/device_pager.c  Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/device_pager.c  Fri Aug  9 11:28:55 2013        (r254141)
@@ -348,11 +348,12 @@ old_dev_pager_fault(vm_object_t object, 
                 */
                page = vm_page_getfake(paddr, memattr);
                VM_OBJECT_WLOCK(object);
+               if (vm_page_replace(page, object, (*mres)->pindex) != *mres)
+                       panic("old_dev_pager_fault: invalid page replacement");
                vm_page_lock(*mres);
                vm_page_free(*mres);
                vm_page_unlock(*mres);
                *mres = page;
-               vm_page_insert(page, object, pidx);
        }
        page->valid = VM_PAGE_BITS_ALL;
        return (VM_PAGER_OK);

Modified: head/sys/vm/sg_pager.c
==============================================================================
--- head/sys/vm/sg_pager.c      Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/sg_pager.c      Fri Aug  9 11:28:55 2013        (r254141)
@@ -186,11 +186,13 @@ sg_pager_getpages(vm_object_t object, vm
 
        /* Free the original pages and insert this fake page into the object. */
        for (i = 0; i < count; i++) {
+               if (i == reqpage &&
+                   vm_page_replace(page, object, offset) != m[i])
+                       panic("sg_pager_getpages: invalid place replacement");
                vm_page_lock(m[i]);
                vm_page_free(m[i]);
                vm_page_unlock(m[i]);
        }
-       vm_page_insert(page, object, offset);
        m[reqpage] = page;
        page->valid = VM_PAGE_BITS_ALL;
 

Modified: head/sys/vm/vm_fault.c
==============================================================================
--- head/sys/vm/vm_fault.c      Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_fault.c      Fri Aug  9 11:28:55 2013        (r254141)
@@ -752,9 +752,11 @@ vnode_locked:
                                 * process'es object.  The page is 
                                 * automatically made dirty.
                                 */
-                               vm_page_lock(fs.m);
-                               vm_page_rename(fs.m, fs.first_object, fs.first_pindex);
-                               vm_page_unlock(fs.m);
+                               if (vm_page_rename(fs.m, fs.first_object,
+                                   fs.first_pindex)) {
+                                       unlock_and_deallocate(&fs);
+                                       goto RetryFault;
+                               }
                                vm_page_xbusy(fs.m);
                                fs.first_m = fs.m;
                                fs.m = NULL;

Modified: head/sys/vm/vm_object.c
==============================================================================
--- head/sys/vm/vm_object.c     Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_object.c     Fri Aug  9 11:28:55 2013        (r254141)
@@ -201,10 +201,12 @@ vm_object_zinit(void *mem, int size, int
 
        /* These are true for any object that has been freed */
        object->rtree.rt_root = 0;
+       object->rtree.rt_flags = 0;
        object->paging_in_progress = 0;
        object->resident_page_count = 0;
        object->shadow_count = 0;
        object->cache.rt_root = 0;
+       object->cache.rt_flags = 0;
        return (0);
 }
 
@@ -1351,6 +1353,16 @@ retry:
                        VM_OBJECT_WLOCK(new_object);
                        goto retry;
                }
+
+               /* vm_page_rename() will handle dirty and cache. */
+               if (vm_page_rename(m, new_object, idx)) {
+                       VM_OBJECT_WUNLOCK(new_object);
+                       VM_OBJECT_WUNLOCK(orig_object);
+                       VM_WAIT;
+                       VM_OBJECT_WLOCK(orig_object);
+                       VM_OBJECT_WLOCK(new_object);
+                       goto retry;
+               }
 #if VM_NRESERVLEVEL > 0
                /*
                 * If some of the reservation's allocated pages remain with
@@ -1366,10 +1378,6 @@ retry:
                 */
                vm_reserv_rename(m, new_object, orig_object, offidxstart);
 #endif
-               vm_page_lock(m);
-               vm_page_rename(m, new_object, idx);
-               vm_page_unlock(m);
-               /* page automatically made dirty by rename and cache handled */
                if (orig_object->type == OBJT_SWAP)
                        vm_page_xbusy(m);
        }
@@ -1525,21 +1533,14 @@ vm_object_backing_scan(vm_object_t objec
                            ("vm_object_backing_scan: object mismatch")
                        );
 
-                       /*
-                        * Destroy any associated swap
-                        */
-                       if (backing_object->type == OBJT_SWAP) {
-                               swap_pager_freespace(
-                                   backing_object, 
-                                   p->pindex,
-                                   1
-                               );
-                       }
-
                        if (
                            p->pindex < backing_offset_index ||
                            new_pindex >= object->size
                        ) {
+                               if (backing_object->type == OBJT_SWAP)
+                                       swap_pager_freespace(backing_object, 
+                                           p->pindex, 1);
+
                                /*
                                 * Page is out of the parent object's range, we 
                                 * can simply destroy it. 
@@ -1561,6 +1562,10 @@ vm_object_backing_scan(vm_object_t objec
                            (op & OBSC_COLLAPSE_NOWAIT) != 0 &&
                            (pp != NULL && pp->valid == 0)
                        ) {
+                               if (backing_object->type == OBJT_SWAP)
+                                       swap_pager_freespace(backing_object, 
+                                           p->pindex, 1);
+
                                /*
                                 * The page in the parent is not (yet) valid.
                                 * We don't know anything about the state of
@@ -1579,6 +1584,10 @@ vm_object_backing_scan(vm_object_t objec
                            pp != NULL ||
                            vm_pager_has_page(object, new_pindex, NULL, NULL)
                        ) {
+                               if (backing_object->type == OBJT_SWAP)
+                                       swap_pager_freespace(backing_object, 
+                                           p->pindex, 1);
+
                                /*
                                 * page already exists in parent OR swap exists
                                 * for this location in the parent.  Destroy 
@@ -1598,25 +1607,38 @@ vm_object_backing_scan(vm_object_t objec
                                continue;
                        }
 
-#if VM_NRESERVLEVEL > 0
-                       /*
-                        * Rename the reservation.
-                        */
-                       vm_reserv_rename(p, object, backing_object,
-                           backing_offset_index);
-#endif
-
                        /*
                         * Page does not exist in parent, rename the
                         * page from the backing object to the main object. 
                         *
                         * If the page was mapped to a process, it can remain 
                         * mapped through the rename.
+                        * vm_page_rename() will handle dirty and cache.
+                        */
+                       if (vm_page_rename(p, object, new_pindex)) {
+                               if (op & OBSC_COLLAPSE_NOWAIT) {
+                                       p = next;
+                                       continue;
+                               }
+                               VM_OBJECT_WLOCK(backing_object);
+                               VM_OBJECT_WUNLOCK(object);
+                               VM_WAIT;
+                               VM_OBJECT_WLOCK(object);
+                               VM_OBJECT_WLOCK(backing_object);
+                               p = TAILQ_FIRST(&backing_object->memq);
+                               continue;
+                       }
+                       if (backing_object->type == OBJT_SWAP)
+                               swap_pager_freespace(backing_object, p->pindex,
+                                   1);
+
+#if VM_NRESERVLEVEL > 0
+                       /*
+                        * Rename the reservation.
                         */
-                       vm_page_lock(p);
-                       vm_page_rename(p, object, new_pindex);
-                       vm_page_unlock(p);
-                       /* page automatically made dirty by rename */
+                       vm_reserv_rename(p, object, backing_object,
+                           backing_offset_index);
+#endif
                }
                p = next;
        }

Modified: head/sys/vm/vm_object.h
==============================================================================
--- head/sys/vm/vm_object.h     Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_object.h     Fri Aug  9 11:28:55 2013        (r254141)
@@ -102,7 +102,7 @@ struct vm_object {
        TAILQ_ENTRY(vm_object) object_list; /* list of all objects */
        LIST_HEAD(, vm_object) shadow_head; /* objects that this is a shadow for */
        LIST_ENTRY(vm_object) shadow_list; /* chain of shadow objects */
-       TAILQ_HEAD(, vm_page) memq;     /* list of resident pages */
+       TAILQ_HEAD(respgs, vm_page) memq; /* list of resident pages */
        struct vm_radix rtree;          /* root of the resident page radix trie*/
        vm_pindex_t size;               /* Object size */
        int generation;                 /* generation ID */

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c       Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_page.c       Fri Aug  9 11:28:55 2013        (r254141)
@@ -145,11 +145,14 @@ SYSCTL_INT(_vm, OID_AUTO, tryrelock_rest
 static uma_zone_t fakepg_zone;
 
 static struct vnode *vm_page_alloc_init(vm_page_t m);
+static void vm_page_cache_turn_free(vm_page_t m);
 static void vm_page_clear_dirty_mask(vm_page_t m, vm_page_bits_t pagebits);
 static void vm_page_enqueue(int queue, vm_page_t m);
 static void vm_page_init_fakepg(void *dummy);
-static void vm_page_insert_after(vm_page_t m, vm_object_t object,
+static int vm_page_insert_after(vm_page_t m, vm_object_t object,
     vm_pindex_t pindex, vm_page_t mpred);
+static void vm_page_insert_radixdone(vm_page_t m, vm_object_t object,
+    vm_page_t mpred);
 
 SYSINIT(vm_page, SI_SUB_VM, SI_ORDER_SECOND, vm_page_init_fakepg, NULL);
 
@@ -930,14 +933,14 @@ vm_page_dirty_KBI(vm_page_t m)
  *
  *     The object must be locked.
  */
-void
+int
 vm_page_insert(vm_page_t m, vm_object_t object, vm_pindex_t pindex)
 {
        vm_page_t mpred;
 
        VM_OBJECT_ASSERT_WLOCKED(object);
        mpred = vm_radix_lookup_le(&object->rtree, pindex);
-       vm_page_insert_after(m, object, pindex, mpred);
+       return (vm_page_insert_after(m, object, pindex, mpred));
 }
 
 /*
@@ -950,10 +953,12 @@ vm_page_insert(vm_page_t m, vm_object_t 
  *
  *     The object must be locked.
  */
-static void
+static int
 vm_page_insert_after(vm_page_t m, vm_object_t object, vm_pindex_t pindex,
     vm_page_t mpred)
 {
+       vm_pindex_t sidx;
+       vm_object_t sobj;
        vm_page_t msucc;
 
        VM_OBJECT_ASSERT_WLOCKED(object);
@@ -975,17 +980,53 @@ vm_page_insert_after(vm_page_t m, vm_obj
        /*
         * Record the object/offset pair in this page
         */
+       sobj = m->object;
+       sidx = m->pindex;
        m->object = object;
        m->pindex = pindex;
 
        /*
         * Now link into the object's ordered list of backed pages.
         */
+       if (vm_radix_insert(&object->rtree, m)) {
+               m->object = sobj;
+               m->pindex = sidx;
+               return (1);
+       }
+       vm_page_insert_radixdone(m, object, mpred);
+       return (0);
+}
+
+/*
+ *     vm_page_insert_radixdone:
+ *
+ *     Complete page "m" insertion into the specified object after the
+ *     radix trie hooking.
+ *
+ *     The page "mpred" must precede the offset "m->pindex" within the
+ *     specified object.
+ *
+ *     The object must be locked.
+ */
+static void
+vm_page_insert_radixdone(vm_page_t m, vm_object_t object, vm_page_t mpred)
+{
+
+       VM_OBJECT_ASSERT_WLOCKED(object);
+       KASSERT(object != NULL && m->object == object,
+           ("vm_page_insert_radixdone: page %p has inconsistent object", m));
+       if (mpred != NULL) {
+               KASSERT(mpred->object == object ||
+                   (mpred->flags & PG_SLAB) != 0,
+                   ("vm_page_insert_after: object doesn't contain mpred"));
+               KASSERT(mpred->pindex < m->pindex,
+                   ("vm_page_insert_after: mpred doesn't precede pindex"));
+       }
+
        if (mpred != NULL)
                TAILQ_INSERT_AFTER(&object->memq, mpred, m, listq);
        else
                TAILQ_INSERT_HEAD(&object->memq, m, listq);
-       vm_radix_insert(&object->rtree, m);
 
        /*
         * Show that the object has one more resident page.
@@ -1131,6 +1172,54 @@ vm_page_prev(vm_page_t m)
 }
 
 /*
+ * Uses the page mnew as a replacement for an existing page at index
+ * pindex which must be already present in the object.
+ */
+vm_page_t
+vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex)
+{
+       vm_page_t mold, mpred;
+
+       VM_OBJECT_ASSERT_WLOCKED(object);
+
+       /*
+        * This function mostly follows vm_page_insert() and
+        * vm_page_remove() without the radix, object count and vnode
+        * dance.  Double check such functions for more comments.
+        */
+       mpred = vm_radix_lookup(&object->rtree, pindex);
+       KASSERT(mpred != NULL,
+           ("vm_page_replace: replacing page not present with pindex"));
+       mpred = TAILQ_PREV(mpred, respgs, listq);
+       if (mpred != NULL)
+               KASSERT(mpred->pindex < pindex,
+                   ("vm_page_insert_after: mpred doesn't precede pindex"));
+
+       mnew->object = object;
+       mnew->pindex = pindex;
+       mold = vm_radix_replace(&object->rtree, mnew, pindex);
+
+       /* Detach the old page from the resident tailq. */
+       TAILQ_REMOVE(&object->memq, mold, listq);
+       vm_page_lock(mold);
+       if (mold->oflags & VPO_BUSY) {
+               mold->oflags &= ~VPO_BUSY;
+               vm_page_flash(mold);
+       }
+       mold->object = NULL;
+       vm_page_unlock(mold);
+
+       /* Insert the new page in the resident tailq. */
+       if (mpred != NULL)
+               TAILQ_INSERT_AFTER(&object->memq, mpred, mnew, listq);
+       else
+               TAILQ_INSERT_HEAD(&object->memq, mnew, listq);
+       if (pmap_page_is_write_mapped(mnew))
+               vm_object_set_writeable_dirty(object);
+       return (mold);
+}
+
+/*
  *     vm_page_rename:
  *
  *     Move the given memory entry from its
@@ -1148,15 +1237,47 @@ vm_page_prev(vm_page_t m)
  *           or vm_page_dirty() will panic.  Dirty pages are not allowed
  *           on the cache.
  *
- *     The objects must be locked.  The page must be locked if it is managed.
+ *     The objects must be locked.
  */
-void
+int
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
+       vm_page_t mpred;
+       vm_pindex_t opidx;
+
+       VM_OBJECT_ASSERT_WLOCKED(new_object);
+
+       mpred = vm_radix_lookup_le(&new_object->rtree, new_pindex);
+       KASSERT(mpred == NULL || mpred->pindex != new_pindex,
+           ("vm_page_rename: pindex already renamed"));
+
+       /*
+        * Create a custom version of vm_page_insert() which does not depend
+        * by m_prev and can cheat on the implementation aspects of the
+        * function.
+        */
+       opidx = m->pindex;
+       m->pindex = new_pindex;
+       if (vm_radix_insert(&new_object->rtree, m)) {
+               m->pindex = opidx;
+               return (1);
+       }
 
+       /*
+        * The operation cannot fail anymore.  The removal must happen before
+        * the listq iterator is tainted.
+        */
+       m->pindex = opidx;
+       vm_page_lock(m);
        vm_page_remove(m);
-       vm_page_insert(m, new_object, new_pindex);
+
+       /* Return back to the new pindex to complete vm_page_insert(). */
+       m->pindex = new_pindex;
+       m->object = new_object;
+       vm_page_unlock(m);
+       vm_page_insert_radixdone(m, new_object, mpred);
        vm_page_dirty(m);
+       return (0);
 }
 
 /*
@@ -1182,14 +1303,7 @@ vm_page_cache_free(vm_object_t object, v
                if (end != 0 && m->pindex >= end)
                        break;
                vm_radix_remove(&object->cache, m->pindex);
-               m->object = NULL;
-               m->valid = 0;
-               /* Clear PG_CACHED and set PG_FREE. */
-               m->flags ^= PG_CACHED | PG_FREE;
-               KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
-                   ("vm_page_cache_free: page %p has inconsistent flags", m));
-               cnt.v_cache_count--;
-               vm_phys_freecnt_adj(m, 1);
+               vm_page_cache_turn_free(m);
        }
        empty = vm_radix_is_empty(&object->cache);
        mtx_unlock(&vm_page_queue_free_mtx);
@@ -1269,7 +1383,8 @@ vm_page_cache_transfer(vm_object_t orig_
                /* Update the page's object and offset. */
                m->object = new_object;
                m->pindex -= offidxstart;
-               vm_radix_insert(&new_object->cache, m);
+               if (vm_radix_insert(&new_object->cache, m))
+                       vm_page_cache_turn_free(m);
        }
        mtx_unlock(&vm_page_queue_free_mtx);
 }
@@ -1361,7 +1476,13 @@ vm_page_alloc(vm_object_t object, vm_pin
                KASSERT(mpred == NULL || mpred->pindex != pindex,
                   ("vm_page_alloc: pindex already allocated"));
        }
-       mtx_lock(&vm_page_queue_free_mtx);
+
+       /*
+        * The page allocation request can came from consumers which already
+        * hold the free page queue mutex, like vm_page_insert() in
+        * vm_page_cache().
+        */
+       mtx_lock_flags(&vm_page_queue_free_mtx, MTX_RECURSE);
        if (cnt.v_free_count + cnt.v_cache_count > cnt.v_free_reserved ||
            (req_class == VM_ALLOC_SYSTEM &&
            cnt.v_free_count + cnt.v_cache_count > cnt.v_interrupt_free_min) ||
@@ -1486,11 +1607,20 @@ vm_page_alloc(vm_object_t object, vm_pin
        m->act_count = 0;
 
        if (object != NULL) {
+               if (vm_page_insert_after(m, object, pindex, mpred)) {
+                       /* See the comment below about hold count. */
+                       if (vp != NULL)
+                               vdrop(vp);
+                       pagedaemon_wakeup();
+                       m->object = NULL;
+                       vm_page_free(m);
+                       return (NULL);
+               }
+
                /* Ignore device objects; the pager sets "memattr" for them. */
                if (object->memattr != VM_MEMATTR_DEFAULT &&
                    (object->flags & OBJ_FICTITIOUS) == 0)
                        pmap_page_set_memattr(m, object->memattr);
-               vm_page_insert_after(m, object, pindex, mpred);
        } else
                m->pindex = pindex;
 
@@ -1557,7 +1687,7 @@ vm_page_alloc_contig(vm_object_t object,
     vm_paddr_t boundary, vm_memattr_t memattr)
 {
        struct vnode *drop;
-       vm_page_t deferred_vdrop_list, m, m_ret;
+       vm_page_t deferred_vdrop_list, m, m_tmp, m_ret;
        u_int flags, oflags;
        int req_class;
 
@@ -1660,12 +1790,29 @@ retry:
                        m->wire_count = 1;
                /* Unmanaged pages don't use "act_count". */
                m->oflags = oflags;
+               if (object != NULL) {
+                       if (vm_page_insert(m, object, pindex)) {
+                               while (deferred_vdrop_list != NULL) {
+               vdrop((struct vnode *)deferred_vdrop_list->pageq.tqe_prev);
+                                       deferred_vdrop_list =
+                                           deferred_vdrop_list->pageq.tqe_next;
+                               }
+                               if (vm_paging_needed())
+                                       pagedaemon_wakeup();
+                               for (m = m_ret, m_tmp = m_ret;
+                                   m < &m_ret[npages]; m++) {
+                                       if (m_tmp < m)
+                                               m_tmp++;
+                                       else
+                                               m->object = NULL;
+                                       vm_page_free(m);
+                               }
+                               return (NULL);
+                       }
+               } else
+                       m->pindex = pindex;
                if (memattr != VM_MEMATTR_DEFAULT)
                        pmap_page_set_memattr(m, memattr);
-               if (object != NULL)
-                       vm_page_insert(m, object, pindex);
-               else
-                       m->pindex = pindex;
                pindex++;
        }
        while (deferred_vdrop_list != NULL) {
@@ -2042,6 +2189,28 @@ vm_page_free_wakeup(void)
 }
 
 /*
+ *     Turn a cached page into a free page, by changing its attributes.
+ *     Keep the statistics up-to-date.
+ *
+ *     The free page queue must be locked.
+ */
+static void
+vm_page_cache_turn_free(vm_page_t m)
+{
+
+       mtx_assert(&vm_page_queue_free_mtx, MA_OWNED);
+
+       m->object = NULL;
+       m->valid = 0;
+       /* Clear PG_CACHED and set PG_FREE. */
+       m->flags ^= PG_CACHED | PG_FREE;
+       KASSERT((m->flags & (PG_CACHED | PG_FREE)) == PG_FREE,
+           ("vm_page_cache_free: page %p has inconsistent flags", m));
+       cnt.v_cache_count--;
+       vm_phys_freecnt_adj(m, 1);
+}
+
+/*
  *     vm_page_free_toq:
  *
  *     Returns the given page to the free list,
@@ -2343,7 +2512,6 @@ vm_page_cache(vm_page_t m)
        }
        KASSERT((m->flags & PG_CACHED) == 0,
            ("vm_page_cache: page %p is already cached", m));
-       PCPU_INC(cnt.v_tcached);
 
        /*
         * Remove the page from the paging queues.
@@ -2370,10 +2538,18 @@ vm_page_cache(vm_page_t m)
         */
        m->flags &= ~PG_ZERO;
        mtx_lock(&vm_page_queue_free_mtx);
+       cache_was_empty = vm_radix_is_empty(&object->cache);
+       if (vm_radix_insert(&object->cache, m)) {
+               mtx_unlock(&vm_page_queue_free_mtx);
+               if (object->resident_page_count == 0)
+                       vdrop(object->handle);
+               m->object = NULL;
+               vm_page_free(m);
+               return;
+       }
        m->flags |= PG_CACHED;
        cnt.v_cache_count++;
-       cache_was_empty = vm_radix_is_empty(&object->cache);
-       vm_radix_insert(&object->cache, m);
+       PCPU_INC(cnt.v_tcached);
 #if VM_NRESERVLEVEL > 0
        if (!vm_reserv_free_page(m)) {
 #else
@@ -2946,11 +3122,8 @@ vm_page_cowfault(vm_page_t m)
        pindex = m->pindex;
 
  retry_alloc:
-       pmap_remove_all(m);
-       vm_page_remove(m);
-       mnew = vm_page_alloc(object, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOBUSY);
+       mnew = vm_page_alloc(NULL, pindex, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ);
        if (mnew == NULL) {
-               vm_page_insert(m, object, pindex);
                vm_page_unlock(m);
                VM_OBJECT_WUNLOCK(object);
                VM_WAIT;
@@ -2976,8 +3149,14 @@ vm_page_cowfault(vm_page_t m)
                vm_page_lock(mnew);
                vm_page_free(mnew);
                vm_page_unlock(mnew);
-               vm_page_insert(m, object, pindex);
        } else { /* clear COW & copy page */
+               pmap_remove_all(m);
+               mnew->object = object;
+               if (object->memattr != VM_MEMATTR_DEFAULT &&
+                   (object->flags & OBJ_FICTITIOUS) == 0)
+                       pmap_page_set_memattr(mnew, object->memattr);
+               if (vm_page_replace(mnew, object, pindex) != m)
+                       panic("vm_page_cowfault: invalid page replacement");
                if (!so_zerocp_fullpage)
                        pmap_copy_page(m, mnew);
                mnew->valid = VM_PAGE_BITS_ALL;

Modified: head/sys/vm/vm_page.h
==============================================================================
--- head/sys/vm/vm_page.h       Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_page.h       Fri Aug  9 11:28:55 2013        (r254141)
@@ -438,7 +438,7 @@ void vm_page_dequeue_locked(vm_page_t m)
 vm_page_t vm_page_find_least(vm_object_t, vm_pindex_t);
 vm_page_t vm_page_getfake(vm_paddr_t paddr, vm_memattr_t memattr);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
-void vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_insert (vm_page_t, vm_object_t, vm_pindex_t);
 boolean_t vm_page_is_cached(vm_object_t object, vm_pindex_t pindex);
 vm_page_t vm_page_lookup (vm_object_t, vm_pindex_t);
 vm_page_t vm_page_next(vm_page_t m);
@@ -449,7 +449,9 @@ void vm_page_putfake(vm_page_t m);
 void vm_page_readahead_finish(vm_page_t m);
 void vm_page_reference(vm_page_t m);
 void vm_page_remove (vm_page_t);
-void vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+int vm_page_rename (vm_page_t, vm_object_t, vm_pindex_t);
+vm_page_t vm_page_replace(vm_page_t mnew, vm_object_t object,
+    vm_pindex_t pindex);
 void vm_page_requeue(vm_page_t m);
 void vm_page_requeue_locked(vm_page_t m);
 int vm_page_sbusied(vm_page_t m);

Modified: head/sys/vm/vm_radix.c
==============================================================================
--- head/sys/vm/vm_radix.c      Fri Aug  9 11:26:26 2013        (r254140)
+++ head/sys/vm/vm_radix.c      Fri Aug  9 11:28:55 2013        (r254141)
@@ -103,30 +103,16 @@ struct vm_radix_node {
 static uma_zone_t vm_radix_node_zone;
 
 /*
- * Allocate a radix node.  Pre-allocation should ensure that the request
- * will always be satisfied.
+ * Allocate a radix node.
  */
 static __inline struct vm_radix_node *
 vm_radix_node_get(vm_pindex_t owner, uint16_t count, uint16_t clevel)
 {
        struct vm_radix_node *rnode;
 
-       rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT);
-
-       /*
-        * The required number of nodes should already be pre-allocated
-        * by vm_radix_prealloc().  However, UMA can hold a few nodes
-        * in per-CPU buckets, which will not be accessible by the
-        * current CPU.  Thus, the allocation could return NULL when
-        * the pre-allocated pool is close to exhaustion.  Anyway,
-        * in practice this should never occur because a new node
-        * is not always required for insert.  Thus, the pre-allocated
-        * pool should have some extra pages that prevent this from
-        * becoming a problem.
-        */
+       rnode = uma_zalloc(vm_radix_node_zone, M_NOWAIT | M_ZERO);
        if (rnode == NULL)
-               panic("%s: uma_zalloc() returned NULL for a new node",
-                   __func__);
+               return (NULL);
        rnode->rn_owner = owner;
        rnode->rn_count = count;
        rnode->rn_clev = clevel;
@@ -295,39 +281,30 @@ vm_radix_node_zone_dtor(void *mem, int s
 }
 #endif
 
+#ifndef UMA_MD_SMALL_ALLOC
 /*
- * Radix node zone initializer.
- */
-static int
-vm_radix_node_zone_init(void *mem, int size __unused, int flags __unused)
-{
-       struct vm_radix_node *rnode;
-
-       rnode = mem;
-       memset(rnode->rn_child, 0, sizeof(rnode->rn_child));
-       return (0);
-}
-
-/*
- * Pre-allocate intermediate nodes from the UMA slab zone.
+ * Reserve the KVA necessary to satisfy the node allocation.
+ * This is mandatory on architectures not supporting direct
+ * mapping, as they would otherwise need to carve into the kernel maps for
+ * every node allocation, resulting in deadlocks for consumers already
+ * working with kernel maps.
  */
 static void
-vm_radix_prealloc(void *arg __unused)
+vm_radix_reserve_kva(void *arg __unused)
 {
-       int nodes;
 
        /*
         * Calculate the number of reserved nodes, discounting the pages that
         * are needed to store them.
         */
-       nodes = ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
-           sizeof(struct vm_radix_node));
-       if (!uma_zone_reserve_kva(vm_radix_node_zone, nodes))
-               panic("%s: unable to create new zone", __func__);
-       uma_prealloc(vm_radix_node_zone, nodes);
+       if (!uma_zone_reserve_kva(vm_radix_node_zone,
+           ((vm_paddr_t)cnt.v_page_count * PAGE_SIZE) / (PAGE_SIZE +
+           sizeof(struct vm_radix_node))))
+               panic("%s: unable to reserve KVA", __func__);
 }
-SYSINIT(vm_radix_prealloc, SI_SUB_KMEM, SI_ORDER_SECOND, vm_radix_prealloc,
-    NULL);
+SYSINIT(vm_radix_reserve_kva, SI_SUB_KMEM, SI_ORDER_SECOND,
+    vm_radix_reserve_kva, NULL);
+#endif
 
 /*
  * Initialize the UMA slab zone.
@@ -345,15 +322,14 @@ vm_radix_init(void)
 #else
            NULL,
 #endif
-           vm_radix_node_zone_init, NULL, VM_RADIX_PAD, UMA_ZONE_VM |
-           UMA_ZONE_NOFREE);
+           NULL, NULL, VM_RADIX_PAD, UMA_ZONE_VM);
 }
 
 /*
  * Inserts the key-value pair into the trie.
  * Panics if the key already exists.
  */
-void
+int
 vm_radix_insert(struct vm_radix *rtree, vm_page_t page)
 {
        vm_pindex_t index, newind;
@@ -365,6 +341,8 @@ vm_radix_insert(struct vm_radix *rtree, 
 
        index = page->pindex;
 
+restart:
+
        /*
         * The owner of record for root is not really important because it
         * will never be used.
@@ -372,7 +350,7 @@ vm_radix_insert(struct vm_radix *rtree, 
        rnode = vm_radix_getroot(rtree);
        if (rnode == NULL) {
                rtree->rt_root = (uintptr_t)page | VM_RADIX_ISLEAF;
-               return;
+               return (0);
        }
        parentp = (void **)&rtree->rt_root;
        for (;;) {
@@ -382,19 +360,43 @@ vm_radix_insert(struct vm_radix *rtree, 
                                panic("%s: key %jx is already present",
                                    __func__, (uintmax_t)index);
                        clev = vm_radix_keydiff(m->pindex, index);
+
+                       /*
+                        * During node allocation the trie that is being
+                        * walked can be modified because of recursing radix
+                        * trie operations.
+                        * If this is the case, the recursing functions signal
+                        * such a situation and the insert operation must
+                        * start from scratch again.
+                        * The freed radix node will then very likely be in
+                        * the UMA caches, which should keep the same
+                        * situation from happening again.
+                        */
+                       rtree->rt_flags |= RT_INSERT_INPROG;
                        tmp = vm_radix_node_get(vm_radix_trimkey(index,
                            clev + 1), 2, clev);
+                       rtree->rt_flags &= ~RT_INSERT_INPROG;
+                       if (tmp == NULL) {
+                               rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+                               return (ENOMEM);
+                       }
+                       if ((rtree->rt_flags & RT_TRIE_MODIFIED) != 0) {
+                               rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+                               tmp->rn_count = 0;
+                               vm_radix_node_put(tmp);
+                               goto restart;
+                       }
                        *parentp = tmp;
                        vm_radix_addpage(tmp, index, clev, page);
                        vm_radix_addpage(tmp, m->pindex, clev, m);
-                       return;
+                       return (0);
                } else if (vm_radix_keybarr(rnode, index))
                        break;
                slot = vm_radix_slot(index, rnode->rn_clev);
                if (rnode->rn_child[slot] == NULL) {
                        rnode->rn_count++;
                        vm_radix_addpage(rnode, index, rnode->rn_clev, page);
-                       return;
+                       return (0);
                }
                parentp = &rnode->rn_child[slot];
                rnode = rnode->rn_child[slot];
@@ -407,12 +409,26 @@ vm_radix_insert(struct vm_radix *rtree, 
         */
        newind = rnode->rn_owner;
        clev = vm_radix_keydiff(newind, index);
-       tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2,
-           clev);
+
+       /* See the comments above. */
+       rtree->rt_flags |= RT_INSERT_INPROG;
+       tmp = vm_radix_node_get(vm_radix_trimkey(index, clev + 1), 2, clev);
+       rtree->rt_flags &= ~RT_INSERT_INPROG;
+       if (tmp == NULL) {
+               rtree->rt_flags &= ~RT_TRIE_MODIFIED;
+               return (ENOMEM);

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscribe@freebsd.org"

Reply via email to