On 2009/07/25 06:31, Guilherme Malschitzky Schroeder wrote:
> I just saw that PG_RELEASED was back and tried it on my gateway
> running -current.
> Now, when I reboot it, I often get a uvm_fault. Before this diff,
> there were no uvm faults.

can you try backing out just PG_RELEASED in isolation from the rest of
the kernel, to be absolutely sure that this is what causes it?

$ cd /sys/uvm
$ TZ=Canada/Mountain cvs di -D '2009/07/22 15:05:40' \
        -D '2009/07/22 15:05:30' | patch
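
for reference, the old protocol that this revert restores works
roughly like this. a rough sketch of my reading of it, not code from
the tree; example_unbusy() and its arguments are made up:

/* sketch: the PG_RELEASED ownership protocol */
void
example_unbusy(struct uvm_object *uobj, struct vm_page *pg)
{
	/* we own pg (we set PG_BUSY earlier); object lock is held */
	if (pg->pg_flags & PG_WANTED)
		wakeup(pg);		/* object lock still held */

	if (pg->pg_flags & PG_RELEASED) {
		/* page was asked to die while we held it busy */
		if (uobj->pgops->pgo_releasepg(pg, NULL))
			simple_unlock(&uobj->vmobjlock); /* obj lives */
		/* FALSE: the object is gone, lock and all */
		return;
	}

	atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_WANTED);
	UVM_PAGE_OWN(pg, NULL);
	simple_unlock(&uobj->vmobjlock);
}

whoever sets PG_RELEASED on a page it does not own must not free the
page itself; the owner disposes of it via pgo_releasepg on un-busy.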


Index: uvm_aobj.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.c,v
retrieving revision 1.46
retrieving revision 1.45
diff -u -p -r1.46 -r1.45
--- uvm_aobj.c  22 Jul 2009 21:05:37 -0000      1.46
+++ uvm_aobj.c  16 Jun 2009 23:54:57 -0000      1.45
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_aobj.c,v 1.46 2009/07/22 21:05:37 oga Exp $       */
+/*     $OpenBSD: uvm_aobj.c,v 1.45 2009/06/16 23:54:57 oga Exp $       */
 /*     $NetBSD: uvm_aobj.c,v 1.39 2001/02/18 21:19:08 chs Exp $        */
 
 /*
@@ -139,7 +139,7 @@ struct pool uao_swhash_elt_pool;
  * uvm_aobj: the actual anon-backed uvm_object
  *
  * => the uvm_object is at the top of the structure, this allows
- *   (struct uvm_aobj *) == (struct uvm_object *)
+ *   (struct uvm_device *) == (struct uvm_object *)
  * => only one of u_swslots and u_swhash is used in any given aobj
  */
 
@@ -176,6 +176,8 @@ static void                  uao_free(struct uvm_aobj 
 static int                      uao_get(struct uvm_object *, voff_t,
                                     vm_page_t *, int *, int, vm_prot_t,
                                     int, int);
+static boolean_t                uao_releasepg(struct vm_page *,
+                                    struct vm_page **);
 static boolean_t                uao_pagein(struct uvm_aobj *, int, int);
 static boolean_t                uao_pagein_page(struct uvm_aobj *, int);
 
@@ -192,6 +194,10 @@ struct uvm_pagerops aobj_pager = {
        NULL,                   /* fault */
        uao_flush,              /* flush */
        uao_get,                /* get */
+       NULL,                   /* put (done by pagedaemon) */
+       NULL,                   /* cluster */
+       NULL,                   /* mk_pcluster */
+       uao_releasepg           /* releasepg */
 };
 
 /*
@@ -559,7 +565,7 @@ uao_init(void)
        simple_lock_init(&uao_list_lock);
 
        /*
-        * NOTE: Pages for this pool must not come from a pageable
+        * NOTE: Pages fror this pool must not come from a pageable
         * kernel map!
         */
        pool_init(&uao_swhash_elt_pool, sizeof(struct uao_swhash_elt),
@@ -635,7 +641,8 @@ void
 uao_detach_locked(struct uvm_object *uobj)
 {
        struct uvm_aobj *aobj = (struct uvm_aobj *)uobj;
-       struct vm_page *pg;
+       struct vm_page *pg, *next;
+       boolean_t busybody;
        UVMHIST_FUNC("uao_detach"); UVMHIST_CALLED(maphist);
 
        /*
@@ -662,26 +669,35 @@ uao_detach_locked(struct uvm_object *uob
        simple_unlock(&uao_list_lock);
 
        /*
-        * Free all pages left in the object. If they're busy, wait
-        * for them to become available before we kill it.
-        * Release swap resources then free the page.
+        * free all the pages that aren't PG_BUSY,
+        * mark for release any that are.
         */
-       uvm_lock_pageq();
-       while((pg = TAILQ_FIRST(&uobj->memq)) != NULL) {
+       busybody = FALSE;
+       for (pg = TAILQ_FIRST(&uobj->memq); pg != NULL; pg = next) {
+               next = TAILQ_NEXT(pg, listq);
                if (pg->pg_flags & PG_BUSY) {
-                       atomic_setbits_int(&pg->pg_flags, PG_WANTED);
-                       uvm_unlock_pageq();
-                       UVM_UNLOCK_AND_WAIT(pg, &uobj->vmobjlock, 0,
-                           "uao_det", 0);
-                       simple_lock(&uobj->vmobjlock);
-                       uvm_lock_pageq();
+                       atomic_setbits_int(&pg->pg_flags, PG_RELEASED);
+                       busybody = TRUE;
                        continue;
                }
+
+               /* zap the mappings, free the swap slot, free the page */
                pmap_page_protect(pg, VM_PROT_NONE);
                uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
+               uvm_lock_pageq();
                uvm_pagefree(pg);
+               uvm_unlock_pageq();
+       }
+
+       /*
+        * if we found any busy pages, we're done for now.
+        * mark the aobj for death, releasepg will finish up for us.
+        */
+       if (busybody) {
+               aobj->u_flags |= UAO_FLAG_KILLME;
+               simple_unlock(&aobj->u_obj.vmobjlock);
+               return;
        }
-       uvm_unlock_pageq();
 
        /*
         * finally, free the rest.
@@ -713,6 +729,35 @@ uao_detach_locked(struct uvm_object *uob
  * => we return TRUE unless we encountered some sort of I/O error
  *     XXXJRT currently never happens, as we never directly initiate
  *     XXXJRT I/O
+ *
+ * comment on "cleaning" object and PG_BUSY pages:
+ *     this routine is holding the lock on the object.  the only time
+ *     that is can run into a PG_BUSY page that it does not own is if
+ *     some other process has started I/O on the page (e.g. either
+ *     a pagein or a pageout).  if the PG_BUSY page is being paged
+ *     in, then it can not be dirty (!PG_CLEAN) because no one has
+ *     had a change to modify it yet.  if the PG_BUSY page is being
+ *     paged out then it means that someone else has already started
+ *     cleaning the page for us (how nice!).  in this case, if we
+ *     have syncio specified, then after we make our pass through the
+ *     object we need to wait for the other PG_BUSY pages to clear
+ *     off (i.e. we need to do an iosync).  also note that once a
+ *     page is PG_BUSY is must stary in its object until it is un-busyed.
+ *     XXXJRT We never actually do this, as we are "flushing" anonymous
+ *     XXXJRT memory, which doesn't have persistent backing store.
+ *
+ * note on page traversal:
+ *     we can traverse the pages in an object either by going down the
+ *     linked list in "uobj->memq", or we can go over the address range
+ *     by page doing hash table lookups for each address.  depending
+ *     on how many pages are in the object it may be cheaper to do one
+ *     or the other.  we set "by_list" to true if we are using memq.
+ *     if the cost of a hash lookup was equal to the cost of the list
+ *     traversal we could compare the number of pages in the start->stop
+ *     range to the total number of pages in the object.  however, it
+ *     seems that a hash table lookup is more expensive than the linked
+ *     list traversal, so we multiply the number of pages in the
+ *     start->stop range by a penalty which we define below.
  */
 
 #define        UAO_HASH_PENALTY 4      /* XXX: a guess */
@@ -721,13 +766,19 @@ boolean_t
 uao_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
 {
        struct uvm_aobj *aobj = (struct uvm_aobj *) uobj;
-       struct vm_page *pp;
+       struct vm_page *pp, *ppnext;
+       boolean_t retval, by_list;
        voff_t curoff;
        UVMHIST_FUNC("uao_flush"); UVMHIST_CALLED(maphist);
 
+       curoff = 0;     /* XXX: shut up gcc */
+
+       retval = TRUE;  /* default to success */
+
        if (flags & PGO_ALLPAGES) {
                start = 0;
                stop = aobj->u_pages << PAGE_SHIFT;
+               by_list = TRUE;         /* always go by the list */
        } else {
                start = trunc_page(start);
                stop = round_page(stop);
@@ -736,10 +787,13 @@ uao_flush(struct uvm_object *uobj, voff_
                            "flush (fixed)\n");
                        stop = aobj->u_pages << PAGE_SHIFT;
                }
+               by_list = (uobj->uo_npages <=
+                   ((stop - start) >> PAGE_SHIFT) * UAO_HASH_PENALTY);
        }
 
-       UVMHIST_LOG(maphist, " flush start=0x%lx, stop=0x%lx, flags=0x%lx",
-           (u_long)start, (u_long)stop, flags, 0);
+       UVMHIST_LOG(maphist,
+           " flush start=0x%lx, stop=0x%lx, by_list=%ld, flags=0x%lx",
+           (u_long)start, (u_long)stop, by_list, flags);
 
        /*
         * Don't need to do any work here if we're not freeing
@@ -748,31 +802,44 @@ uao_flush(struct uvm_object *uobj, voff_
        if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
                UVMHIST_LOG(maphist,
                    "<- done (no work to do)",0,0,0,0);
-               return (TRUE);
+               return (retval);
        }
 
-       /* locked: uobj */
-       curoff = start;
-       for (;;) {
-               if (curoff < stop) {
-                       pp = uvm_pagelookup(uobj, curoff);
-                       curoff += PAGE_SIZE;
-                       if (pp == NULL)
+       /*
+        * now do it.  note: we must update ppnext in the body of loop or we
+        * will get stuck.  we need to use ppnext because we may free "pp"
+        * before doing the next loop.
+        */
+
+       if (by_list) {
+               pp = TAILQ_FIRST(&uobj->memq);
+       } else {
+               curoff = start;
+               pp = uvm_pagelookup(uobj, curoff);
+       }
+
+       ppnext = NULL;  /* XXX: shut up gcc */
+       uvm_lock_pageq();       /* page queues locked */
+
+       /* locked: both page queues and uobj */
+       for ( ; (by_list && pp != NULL) ||
+           (!by_list && curoff < stop) ; pp = ppnext) {
+               if (by_list) {
+                       ppnext = TAILQ_NEXT(pp, listq);
+
+                       /* range check */
+                       if (pp->offset < start || pp->offset >= stop)
                                continue;
                } else {
-                       break;
-               }
+                       curoff += PAGE_SIZE;
+                       if (curoff < stop)
+                               ppnext = uvm_pagelookup(uobj, curoff);
 
-               /* Make sure page is unbusy, else wait for it. */
-               if (pp->pg_flags & PG_BUSY) {
-                       atomic_setbits_int(&pp->pg_flags, PG_WANTED);
-                       UVM_UNLOCK_AND_WAIT(pp, &uobj->vmobjlock, 0,
-                           "uaoflsh", 0);
-                       simple_lock(&uobj->vmobjlock);
-                       curoff -= PAGE_SIZE;
-                       continue;
+                       /* null check */
+                       if (pp == NULL)
+                               continue;
                }
-
+               
                switch (flags & (PGO_CLEANIT|PGO_FREE|PGO_DEACTIVATE)) {
                /*
                 * XXX In these first 3 cases, we always just
@@ -781,9 +848,7 @@ uao_flush(struct uvm_object *uobj, voff_
                 * XXX in the future.
                 */
                case PGO_CLEANIT|PGO_FREE:
-                       /* FALLTHROUGH */
                case PGO_CLEANIT|PGO_DEACTIVATE:
-                       /* FALLTHROUGH */
                case PGO_DEACTIVATE:
  deactivate_it:
                        /* skip the page if it's loaned or wired */
@@ -791,13 +856,16 @@ uao_flush(struct uvm_object *uobj, voff_
                            pp->wire_count != 0)
                                continue;
 
-                       uvm_lock_pageq();
+#ifdef UBC
+                       /* ...and deactivate the page. */
+                       pmap_clear_reference(pp);
+#else
                        /* zap all mappings for the page. */
                        pmap_page_protect(pp, VM_PROT_NONE);
 
                        /* ...and deactivate the page. */
+#endif
                        uvm_pagedeactivate(pp);
-                       uvm_unlock_pageq();
 
                        continue;
 
@@ -814,13 +882,19 @@ uao_flush(struct uvm_object *uobj, voff_
                            pp->wire_count != 0)
                                continue;
 
+                       /*
+                        * mark the page as released if its busy.
+                        */
+                       if (pp->pg_flags & PG_BUSY) {
+                               atomic_setbits_int(&pp->pg_flags, PG_RELEASED);
+                               continue;
+                       }
+
                        /* zap all mappings for the page. */
                        pmap_page_protect(pp, VM_PROT_NONE);
 
                        uao_dropswap(uobj, pp->offset >> PAGE_SHIFT);
-                       uvm_lock_pageq();
                        uvm_pagefree(pp);
-                       uvm_unlock_pageq();
 
                        continue;
 
@@ -829,9 +903,11 @@ uao_flush(struct uvm_object *uobj, voff_
                }
        }
 
+       uvm_unlock_pageq();
+
        UVMHIST_LOG(maphist,
            "<- done, rv=%ld",retval,0,0,0);
-       return (TRUE);
+       return (retval);
 }
 
 /*
@@ -913,10 +989,10 @@ uao_get(struct uvm_object *uobj, voff_t 
                        }
 
                        /*
-                        * to be useful must get a non-busy page
+                        * to be useful must get a non-busy, non-released page
                         */
                        if (ptmp == NULL ||
-                           (ptmp->pg_flags & PG_BUSY) != 0) {
+                           (ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) {
                                if (lcv == centeridx ||
                                    (flags & PGO_ALLPAGES) != 0)
                                        /* need to do a wait or I/O! */
@@ -1023,7 +1099,7 @@ uao_get(struct uvm_object *uobj, voff_t 
                        }
 
                        /* page is there, see if we need to wait on it */
-                       if ((ptmp->pg_flags & PG_BUSY) != 0) {
+                       if ((ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) {
                                atomic_setbits_int(&ptmp->pg_flags, PG_WANTED);
                                UVMHIST_LOG(pdhist,
                                    "sleeping, ptmp->flags 0x%lx\n",
@@ -1062,7 +1138,8 @@ uao_get(struct uvm_object *uobj, voff_t 
                /*
                 * just zero the page if there's nothing in swap.
                 */
-               if (swslot == 0) {
+               if (swslot == 0)
+               {
                        /*
                         * page hasn't existed before, just zero it.
                         */
@@ -1139,6 +1216,65 @@ uao_get(struct uvm_object *uobj, voff_t 
 }
 
 /*
+ * uao_releasepg: handle released page in an aobj
+ * 
+ * => "pg" is a PG_BUSY [caller owns it], PG_RELEASED page that we need
+ *      to dispose of.
+ * => caller must handle PG_WANTED case
+ * => called with page's object locked, pageq's unlocked
+ * => returns TRUE if page's object is still alive, FALSE if we
+ *      killed the page's object.    if we return TRUE, then we
+ *      return with the object locked.
+ * => if (nextpgp != NULL) => we return the next page on the queue, and return
+ *                              with the page queues locked [for pagedaemon]
+ * => if (nextpgp == NULL) => we return with page queues unlocked [normal case]
+ * => we kill the aobj if it is not referenced and we are suppose to
+ *      kill it ("KILLME").
+ */
+static boolean_t
+uao_releasepg(struct vm_page *pg, struct vm_page **nextpgp /* OUT */)
+{
+       struct uvm_aobj *aobj = (struct uvm_aobj *) pg->uobject;
+
+       KASSERT(pg->pg_flags & PG_RELEASED);
+
+       /*
+        * dispose of the page [caller handles PG_WANTED] and swap slot.
+        */
+       pmap_page_protect(pg, VM_PROT_NONE);
+       uao_dropswap(&aobj->u_obj, pg->offset >> PAGE_SHIFT);
+       uvm_lock_pageq();
+       if (nextpgp)
+               *nextpgp = TAILQ_NEXT(pg, pageq); /* next page for daemon */
+       uvm_pagefree(pg);
+       if (!nextpgp)
+               uvm_unlock_pageq();             /* keep locked for daemon */
+
+       /*
+        * if we're not killing the object, we're done.
+        */
+       if ((aobj->u_flags & UAO_FLAG_KILLME) == 0)
+               return TRUE;
+       KASSERT(aobj->u_obj.uo_refs == 0);
+
+       /*
+        * if there are still pages in the object, we're done for now.
+        */
+       if (aobj->u_obj.uo_npages != 0)
+               return TRUE;
+
+       KASSERT(TAILQ_EMPTY(&aobj->u_obj.memq));
+
+       /*
+        * finally, free the rest.
+        */
+       uao_free(aobj);
+
+       return FALSE;
+}
+
+
+/*
  * uao_dropswap:  release any swap resources from this aobj page.
  * 
  * => aobj must be locked or have a reference count of 0.
@@ -1340,6 +1476,7 @@ uao_pagein_page(struct uvm_aobj *aobj, i
                return FALSE;
 
        }
+       KASSERT((pg->pg_flags & PG_RELEASED) == 0);
 
        /*
         * ok, we've got the page now.
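
the comment above uao_releasepg() spells out the contract; condensed,
the normal-case caller side looks like this (a sketch only, with uobj
and pg as placeholders):

	boolean_t alive;

	/* object locked; pg is our PG_BUSY + PG_RELEASED page */
	alive = uobj->pgops->pgo_releasepg(pg, NULL);
	if (alive)
		simple_unlock(&uobj->vmobjlock);	/* still ours */
	/* !alive: releasepg killed the object; never touch it again */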
Index: uvm_aobj.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_aobj.h,v
retrieving revision 1.12
retrieving revision 1.11
diff -u -p -r1.12 -r1.11
--- uvm_aobj.h  22 Jul 2009 21:05:37 -0000      1.12
+++ uvm_aobj.h  16 Jun 2009 23:54:57 -0000      1.11
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_aobj.h,v 1.12 2009/07/22 21:05:37 oga Exp $       */
+/*     $OpenBSD: uvm_aobj.h,v 1.11 2009/06/16 23:54:57 oga Exp $       */
 /*     $NetBSD: uvm_aobj.h,v 1.10 2000/01/11 06:57:49 chs Exp $        */
 
 /*
@@ -55,6 +55,8 @@
 #define UAO_FLAG_KERNSWAP      0x2     /* enable kernel swap */
 
 /* internal flags */
+#define UAO_FLAG_KILLME                0x4     /* aobj should die when last released
+                                        * page is no longer PG_BUSY ... */
 #define UAO_FLAG_NOSWAP                0x8     /* aobj can't swap (kernel obj only!) */
 
 #ifdef _KERNEL
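
UAO_FLAG_KILLME is what ties uao_detach_locked() to uao_releasepg();
the life cycle, condensed from the two functions in the uvm_aobj.c
diff above (sketch, simplified):

	/* uao_detach_locked(): some pages were PG_BUSY, can't free yet */
	if (busybody) {
		aobj->u_flags |= UAO_FLAG_KILLME; /* releasepg finishes */
		simple_unlock(&aobj->u_obj.vmobjlock);
		return;
	}

	/* uao_releasepg(): after freeing the last page of such an aobj */
	if ((aobj->u_flags & UAO_FLAG_KILLME) && aobj->u_obj.uo_npages == 0) {
		uao_free(aobj);		/* the object dies here */
		return FALSE;
	}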
Index: uvm_fault.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_fault.c,v
retrieving revision 1.58
retrieving revision 1.57
diff -u -p -r1.58 -r1.57
--- uvm_fault.c 22 Jul 2009 21:05:37 -0000      1.58
+++ uvm_fault.c 16 Jun 2009 23:54:58 -0000      1.57
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_fault.c,v 1.58 2009/07/22 21:05:37 oga Exp $      */
+/*     $OpenBSD: uvm_fault.c,v 1.57 2009/06/16 23:54:58 oga Exp $      */
 /*     $NetBSD: uvm_fault.c,v 1.51 2000/08/06 00:22:53 thorpej Exp $   */
 
 /*
@@ -921,10 +921,10 @@ ReFault:
 
                                /*
                                 * if center page is resident and not
-                                * PG_BUSY, then pgo_get made it PG_BUSY
-                                * for us and gave us a handle to it.
-                                * remember this page as "uobjpage."
-                                * (for later use).
+                                * PG_BUSY|PG_RELEASED then pgo_get
+                                * made it PG_BUSY for us and gave
+                                * us a handle to it.   remember this
+                                * page as "uobjpage." (for later use).
                                 */
                                
                                if (lcv == centeridx) {
@@ -966,8 +966,8 @@ ReFault:
                                     (wired ? PMAP_WIRED : 0));
 
                                /* 
-                                * NOTE: page can't be PG_WANTED because
-                                * we've held the lock the whole time
+                                * NOTE: page can't be PG_WANTED or PG_RELEASED
+                                * because we've held the lock the whole time
                                 * we've had the handle.
                                 */
 
@@ -1371,12 +1371,15 @@ Case2:
                /* locked(!locked): uobj, uobjpage */
 
                /*
-                * Re-verify that amap slot is still free. if there is
-                * a problem, we unlock and clean up.
+                * verify that the page has not be released and re-verify
+                * that amap slot is still free.   if there is a problem,
+                * we unlock and clean up.
                 */
 
-               if (locked && amap && amap_lookup(&ufi.entry->aref,
-                     ufi.orig_rvaddr - ufi.entry->start)) {
+               if ((uobjpage->pg_flags & PG_RELEASED) != 0 ||
+                   (locked && amap && 
+                   amap_lookup(&ufi.entry->aref,
+                     ufi.orig_rvaddr - ufi.entry->start))) {
                        if (locked) 
                                uvmfault_unlockall(&ufi, amap, NULL, NULL);
                        locked = FALSE;
@@ -1395,6 +1398,17 @@ Case2:
                                /* still holding object lock */
                                wakeup(uobjpage);
 
+                       if (uobjpage->pg_flags & PG_RELEASED) {
+                               uvmexp.fltpgrele++;
+                               KASSERT(uobj->pgops->pgo_releasepg != NULL);
+
+                               /* frees page */
+                               if (uobj->pgops->pgo_releasepg(uobjpage,NULL))
+                                       /* unlock if still alive */
+                                       simple_unlock(&uobj->vmobjlock);
+                               goto ReFault;
+                       }
+
                        uvm_lock_pageq();
                        /* make sure it is in queues */
                        uvm_pageactivate(uobjpage);
@@ -1409,8 +1423,9 @@ Case2:
                }
 
                /*
-                * we have the data in uobjpage which is PG_BUSY and we are
-                * holding object lock.
+                * we have the data in uobjpage which is PG_BUSY and
+                * !PG_RELEASED.  we are holding object lock (so the page
+                * can't be released on us).
                 */
 
                /* locked: maps(read), amap(if !null), uobj, uobjpage */
@@ -1424,6 +1439,8 @@ Case2:
        /*
         * notes:
         *  - at this point uobjpage can not be NULL
+        *  - at this point uobjpage can not be PG_RELEASED (since we checked
+        *  for it above)
         *  - at this point uobjpage could be PG_WANTED (handle later)
         */
                
@@ -1610,7 +1627,9 @@ Case2:
                        }
                        
                        /*
-                        * dispose of uobjpage. drop handle to uobj as well.
+                        * dispose of uobjpage.  it can't be PG_RELEASED
+                        * since we still hold the object lock.
+                        * drop handle to uobj as well.
                         */
 
                        if (uobjpage->pg_flags & PG_WANTED)
@@ -1673,6 +1692,11 @@ Case2:
                if (pg->pg_flags & PG_WANTED)
                        wakeup(pg);             /* lock still held */
 
+               /* 
+                * note that pg can't be PG_RELEASED since we did not drop
+                * the object lock since the last time we checked.
+                */
+ 
                atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
                UVM_PAGE_OWN(pg, NULL);
                uvmfault_unlockall(&ufi, amap, uobj, NULL);
@@ -1712,6 +1736,11 @@ Case2:
        if (pg->pg_flags & PG_WANTED)
                wakeup(pg);             /* lock still held */
 
+       /* 
+        * note that pg can't be PG_RELEASED since we did not drop the object 
+        * lock since the last time we checked.
+        */
+ 
        atomic_clearbits_int(&pg->pg_flags, PG_BUSY|PG_FAKE|PG_WANTED);
        UVM_PAGE_OWN(pg, NULL);
        uvmfault_unlockall(&ufi, amap, uobj, NULL);
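
all the uvm_fault.c hunks follow one shape: whenever the object lock
was dropped (e.g. to sleep on a busy page), PG_RELEASED has to be
rechecked once the lock is retaken. condensed from the hunks above
(sketch):

	/* object just relocked after a sleep */
	if (uobjpage->pg_flags & PG_WANTED)
		wakeup(uobjpage);	/* still holding object lock */

	if (uobjpage->pg_flags & PG_RELEASED) {
		uvmexp.fltpgrele++;
		if (uobj->pgops->pgo_releasepg(uobjpage, NULL))
			simple_unlock(&uobj->vmobjlock);
		goto ReFault;		/* page is gone, start over */
	}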
Index: uvm_km.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_km.c,v
retrieving revision 1.74
retrieving revision 1.73
diff -u -p -r1.74 -r1.73
--- uvm_km.c    22 Jul 2009 21:05:37 -0000      1.74
+++ uvm_km.c    17 Jun 2009 00:13:59 -0000      1.73
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_km.c,v 1.74 2009/07/22 21:05:37 oga Exp $ */
+/*     $OpenBSD: uvm_km.c,v 1.73 2009/06/17 00:13:59 oga Exp $ */
 /*     $NetBSD: uvm_km.c,v 1.42 2001/01/14 02:10:01 thorpej Exp $      */
 
 /* 
@@ -276,12 +276,8 @@ uvm_km_pgremove(struct uvm_object *uobj,
                    pp->pg_flags & PG_BUSY, 0, 0);
 
                if (pp->pg_flags & PG_BUSY) {
-                       atomic_setbits_int(&pp->pg_flags, PG_WANTED);
-                       UVM_UNLOCK_AND_WAIT(pp, &uobj->vmobjlock, 0,
-                           "km_pgrm", 0);
-                       simple_lock(&uobj->vmobjlock);
-                       curoff -= PAGE_SIZE; /* loop back to us */
-                       continue;
+                       /* owner must check for this when done */
+                       atomic_setbits_int(&pp->pg_flags, PG_RELEASED);
                } else {
                        /* free the swap slot... */
                        uao_dropswap(uobj, curoff >> PAGE_SHIFT);
@@ -515,6 +511,21 @@ uvm_km_alloc1(struct vm_map *map, vsize_
        loopva = kva;
        while (size) {
                simple_lock(&uvm.kernel_object->vmobjlock);
+               pg = uvm_pagelookup(uvm.kernel_object, offset);
+
+               /*
+                * if we found a page in an unallocated region, it must be
+                * released
+                */
+               if (pg) {
+                       if ((pg->pg_flags & PG_RELEASED) == 0)
+                               panic("uvm_km_alloc1: non-released page");
+                       atomic_setbits_int(&pg->pg_flags, PG_WANTED);
+                       UVM_UNLOCK_AND_WAIT(pg, &uvm.kernel_object->vmobjlock,
+                           FALSE, "km_alloc", 0);
+                       continue;   /* retry */
+               }
+               
                /* allocate ram */
                pg = uvm_pagealloc(uvm.kernel_object, offset, NULL, 0);
                if (pg) {
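
the uvm_km_alloc1() hunk is one half of the usual PG_WANTED
handshake; both halves side by side (sketch):

	/* waiter: found a released page in the range, wait it out */
	atomic_setbits_int(&pg->pg_flags, PG_WANTED);
	UVM_UNLOCK_AND_WAIT(pg, &uvm.kernel_object->vmobjlock,
	    FALSE, "km_alloc", 0);
	/* ...then loop and redo the uvm_pagelookup()... */

	/* owner: when finally un-busying or releasing the page */
	if (pg->pg_flags & PG_WANTED)
		wakeup(pg);		/* object lock still held */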
Index: uvm_loan.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_loan.c,v
retrieving revision 1.34
retrieving revision 1.33
diff -u -p -r1.34 -r1.33
--- uvm_loan.c  22 Jul 2009 21:05:37 -0000      1.34
+++ uvm_loan.c  17 Jun 2009 00:13:59 -0000      1.33
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_loan.c,v 1.34 2009/07/22 21:05:37 oga Exp $       */
+/*     $OpenBSD: uvm_loan.c,v 1.33 2009/06/17 00:13:59 oga Exp $       */
 /*     $NetBSD: uvm_loan.c,v 1.22 2000/06/27 17:29:25 mrg Exp $        */
 
 /*
@@ -462,12 +462,14 @@ uvm_loanuobj(struct uvm_faultinfo *ufi, 
                simple_lock(&uobj->vmobjlock);
 
                /*
-                * Re-verify that amap slot is still free. if there is a
-                * problem we drop our lock (thus force a lookup refresh/retry).
+                * verify that the page has not be released and re-verify
+                * that amap slot is still free.   if there is a problem we
+                * drop our lock (thus force a lookup refresh/retry).
                 */
                        
-               if (locked && amap && amap_lookup(&ufi->entry->aref,
-                   ufi->orig_rvaddr - ufi->entry->start)) {
+               if ((pg->pg_flags & PG_RELEASED) != 0 ||
+                   (locked && amap && amap_lookup(&ufi->entry->aref,
+                   ufi->orig_rvaddr - ufi->entry->start))) {
                        
                        if (locked)
                                uvmfault_unlockall(ufi, amap, NULL, NULL);
@@ -484,6 +486,17 @@ uvm_loanuobj(struct uvm_faultinfo *ufi, 
                                /* still holding object lock */
                                wakeup(pg);
 
+                       if (pg->pg_flags & PG_RELEASED) {
+#ifdef DIAGNOSTIC
+                               if (uobj->pgops->pgo_releasepg == NULL)
+                       panic("uvm_loanuobj: object has no releasepg function");
+#endif
+                               /* frees page */
+                               if (uobj->pgops->pgo_releasepg(pg, NULL))
+                                       simple_unlock(&uobj->vmobjlock);
+                               return (0);
+                       }
+
                        uvm_lock_pageq();
                        uvm_pageactivate(pg); /* make sure it is in queues */
                        uvm_unlock_pageq();
@@ -496,7 +509,8 @@ uvm_loanuobj(struct uvm_faultinfo *ufi, 
 
        /*
         * at this point we have the page we want ("pg") marked PG_BUSY for us
-        * and we have all data structures locked.   do the loanout.
+        * and we have all data structures locked.   do the loanout.   page can
+        * not be PG_RELEASED (we caught this above).
         */
 
        if ((flags & UVM_LOAN_TOANON) == 0) {   /* loan to wired-kernel page? */
Index: uvm_mmap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.78
retrieving revision 1.77
diff -u -p -r1.78 -r1.77
--- uvm_mmap.c  22 Jul 2009 21:05:37 -0000      1.78
+++ uvm_mmap.c  9 Jul 2009 22:29:56 -0000       1.77
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_mmap.c,v 1.78 2009/07/22 21:05:37 oga Exp $       */
+/*     $OpenBSD: uvm_mmap.c,v 1.77 2009/07/09 22:29:56 thib Exp $      */
 /*     $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $        */
 
 /*
@@ -298,7 +298,8 @@ sys_mincore(struct proc *p, void *v, reg
                 */
                if (UVM_ET_ISOBJ(entry)) {
                        KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
-                       if (entry->object.uvm_obj->pgops->pgo_fault != NULL) {
+                       if (entry->object.uvm_obj->pgops->pgo_releasepg
+                           == NULL) {
                                pgi = 1;
                                for (/* nothing */; start < lim;
                                     start += PAGE_SIZE, vec++)
Index: uvm_page.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_page.c,v
retrieving revision 1.92
retrieving revision 1.91
diff -u -p -r1.92 -r1.91
--- uvm_page.c  22 Jul 2009 21:05:37 -0000      1.92
+++ uvm_page.c  17 Jun 2009 00:13:59 -0000      1.91
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_page.c,v 1.92 2009/07/22 21:05:37 oga Exp $       */
+/*     $OpenBSD: uvm_page.c,v 1.91 2009/06/17 00:13:59 oga Exp $       */
 /*     $NetBSD: uvm_page.c,v 1.44 2000/11/27 08:40:04 chs Exp $        */
 
 /* 
@@ -1245,14 +1245,7 @@ uvm_page_unbusy(struct vm_page **pgs, in
                        UVMHIST_LOG(pdhist, "releasing pg %p", pg,0,0,0);
                        uobj = pg->uobject;
                        if (uobj != NULL) {
-                               uvm_lock_pageq();
-                               pmap_page_protect(pg, VM_PROT_NONE);
-                               /* XXX won't happen right now */
-                               if (pg->pg_flags & PQ_ANON)
-                                       uao_dropswap(uobj,
-                                           pg->offset >> PAGE_SHIFT);
-                               uvm_pagefree(pg);
-                               uvm_unlock_pageq();
+                               uobj->pgops->pgo_releasepg(pg, NULL);
                        } else {
                                atomic_clearbits_int(&pg->pg_flags, PG_BUSY);
                                UVM_PAGE_OWN(pg, NULL);
Index: uvm_pager.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.c,v
retrieving revision 1.54
retrieving revision 1.53
diff -u -p -r1.54 -r1.53
--- uvm_pager.c 22 Jul 2009 21:05:37 -0000      1.54
+++ uvm_pager.c 17 Jun 2009 00:13:59 -0000      1.53
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_pager.c,v 1.54 2009/07/22 21:05:37 oga Exp $      */
+/*     $OpenBSD: uvm_pager.c,v 1.53 2009/06/17 00:13:59 oga Exp $      */
 /*     $NetBSD: uvm_pager.c,v 1.36 2000/11/27 18:26:41 chs Exp $       */
 
 /*
@@ -339,8 +339,6 @@ uvm_pagermapout(vaddr_t kva, int npages)
  *      PGO_ALLPAGES:  all pages in object are valid targets
  *      !PGO_ALLPAGES: use "lo" and "hi" to limit range of cluster
  *      PGO_DOACTCLUST: include active pages in cluster.
- *     PGO_FREE: set the PG_RELEASED bits on the cluster so they'll be freed
- *             in async io (caller must clean on error).
  *        NOTE: the caller should clear PG_CLEANCHK bits if PGO_DOACTCLUST.
  *              PG_CLEANCHK is only a hint, but clearing will help reduce
  *             the number of calls we make to the pmap layer.
@@ -442,14 +440,6 @@ uvm_mk_pcluster(struct uvm_object *uobj,
                        atomic_setbits_int(&pclust->pg_flags, PG_BUSY);
                        UVM_PAGE_OWN(pclust, "uvm_mk_pcluster");
 
-                       /*
-                        * If we want to free after io is done, and we're
-                        * async, set the released flag
-                        */
-                       if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
-                               atomic_setbits_int(&pclust->pg_flags,
-                                   PG_RELEASED);
-
                        /* XXX: protect wired page?   see above comment. */
                        pmap_page_protect(pclust, VM_PROT_READ);
                        if (!forward) {
@@ -491,7 +481,6 @@ uvm_mk_pcluster(struct uvm_object *uobj,
  *     PGO_DOACTCLUST: include "PQ_ACTIVE" pages as valid targets
  *     PGO_SYNCIO: do SYNC I/O (no async)
  *     PGO_PDFREECLUST: pagedaemon: drop cluster on successful I/O
- *     PGO_FREE: tell the aio daemon to free pages in the async case.
  * => start/stop: if (uobj && !PGO_ALLPAGES) limit targets to this range
  *               if (!uobj) start is the (daddr64_t) of the starting swapblk
  * => return state:
@@ -715,6 +704,8 @@ uvm_pager_dropcluster(struct uvm_object 
     struct vm_page **ppsp, int *npages, int flags)
 {
        int lcv;
+       boolean_t obj_is_alive; 
+       struct uvm_object *saved_uobj;
 
        /*
         * drop all pages but "pg"
@@ -756,8 +747,9 @@ uvm_pager_dropcluster(struct uvm_object 
                }
 
                /* if page was released, release it.  otherwise un-busy it */
-               if (ppsp[lcv]->pg_flags & PG_RELEASED &&
-                   ppsp[lcv]->pg_flags & PQ_ANON) {
+               if (ppsp[lcv]->pg_flags & PG_RELEASED) {
+
+                       if (ppsp[lcv]->pg_flags & PQ_ANON) {
                                /* so that anfree will free */
                                atomic_clearbits_int(&ppsp[lcv]->pg_flags,
                                    PG_BUSY);
@@ -769,13 +761,34 @@ uvm_pager_dropcluster(struct uvm_object 
                                uvm_anfree(ppsp[lcv]->uanon);
 
                                continue;
-               } else {
+                       }
+
+                       /*
+                        * pgo_releasepg will dump the page for us
+                        */
+
+                       saved_uobj = ppsp[lcv]->uobject;
+                       obj_is_alive =
+                           saved_uobj->pgops->pgo_releasepg(ppsp[lcv], NULL);
+                       
+                       /* for normal objects, "pg" is still PG_BUSY by us,
+                        * so obj can't die */
+                       KASSERT(!uobj || obj_is_alive);
+
+                       /* only unlock the object if it is still alive...  */
+                       if (obj_is_alive && saved_uobj != uobj)
+                               simple_unlock(&saved_uobj->vmobjlock);
+
                        /*
-                        * if we were planning on async io then we would
-                        * have PG_RELEASED set, clear that with the others.
+                        * XXXCDC: suppose uobj died in the pgo_releasepg?
+                        * how pass that
+                        * info up to caller.  we are currently ignoring it...
                         */
+
+                       continue;               /* next page */
+               } else {
                        atomic_clearbits_int(&ppsp[lcv]->pg_flags,
-                           PG_BUSY|PG_WANTED|PG_FAKE|PG_RELEASED);
+                           PG_BUSY|PG_WANTED|PG_FAKE);
                        UVM_PAGE_OWN(ppsp[lcv], NULL);
                }
 
@@ -798,6 +811,33 @@ uvm_pager_dropcluster(struct uvm_object 
                }
        }
 }
+
+#ifdef UBC
+/*
+ * interrupt-context iodone handler for nested i/o bufs.
+ *
+ * => must be at splbio().
+ */
+
+void
+uvm_aio_biodone1(struct buf *bp)
+{
+       struct buf *mbp = bp->b_private;
+
+       splassert(IPL_BIO);
+
+       KASSERT(mbp != bp);
+       if (bp->b_flags & B_ERROR) {
+               mbp->b_flags |= B_ERROR;
+               mbp->b_error = bp->b_error;
+       }
+       mbp->b_resid -= bp->b_bcount;
+       pool_put(&bufpool, bp);
+       if (mbp->b_resid == 0) {
+               biodone(mbp);
+       }
+}
+#endif
 
 /*
  * interrupt-context iodone handler for single-buf i/os
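
one subtlety in the restored uvm_pager_dropcluster(): cluster pages
can belong to a different object than the one the caller locked, so
the unlock after pgo_releasepg must be conditional. condensed from
the hunk above (sketch):

	saved_uobj = ppsp[lcv]->uobject; /* may differ from caller's uobj */
	obj_is_alive = saved_uobj->pgops->pgo_releasepg(ppsp[lcv], NULL);

	/* the caller's own object can't die: "pg" is still PG_BUSY by us */
	KASSERT(!uobj || obj_is_alive);

	if (obj_is_alive && saved_uobj != uobj)
		simple_unlock(&saved_uobj->vmobjlock);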
Index: uvm_pager.h
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pager.h,v
retrieving revision 1.27
retrieving revision 1.26
diff -u -p -r1.27 -r1.26
--- uvm_pager.h 22 Jul 2009 21:05:37 -0000      1.27
+++ uvm_pager.h 17 Jun 2009 00:13:59 -0000      1.26
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_pager.h,v 1.27 2009/07/22 21:05:37 oga Exp $      */
+/*     $OpenBSD: uvm_pager.h,v 1.26 2009/06/17 00:13:59 oga Exp $      */
 /*     $NetBSD: uvm_pager.h,v 1.20 2000/11/27 08:40:05 chs Exp $       */
 
 /*
@@ -109,6 +109,8 @@ struct uvm_pagerops {
        struct vm_page **       (*pgo_mk_pcluster)(struct uvm_object *,
                                 struct vm_page **, int *, struct vm_page *,
                                 int, voff_t, voff_t);
+                                               /* release page */
+       boolean_t               (*pgo_releasepg)(struct vm_page *, struct vm_page **);
 };
 
 /* pager flags [mostly for flush] */
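
with the slot back, a pager opts in by filling pgo_releasepg in its
uvm_pagerops (cf. aobj_pager and uvm_vnodeops above). a sketch with
C99 designated initializers; the tree itself uses positional
initializers, and the example_* names are made up:

struct uvm_pagerops example_pager = {
	.pgo_flush	= example_flush,
	.pgo_get	= example_get,
	.pgo_releasepg	= example_releasepg, /* dispose of PG_RELEASED pages */
};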
Index: uvm_pdaemon.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_pdaemon.c,v
retrieving revision 1.52
retrieving revision 1.51
diff -u -p -r1.52 -r1.51
--- uvm_pdaemon.c       22 Jul 2009 21:05:37 -0000      1.52
+++ uvm_pdaemon.c       26 Jun 2009 20:26:02 -0000      1.51
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_pdaemon.c,v 1.52 2009/07/22 21:05:37 oga Exp $    */
+/*     $OpenBSD: uvm_pdaemon.c,v 1.51 2009/06/26 20:26:02 oga Exp $    */
 /*     $NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $   */
 
 /* 
@@ -820,25 +820,40 @@ uvmpd_scan_inactive(struct pglist *pglst
                        atomic_clearbits_int(&p->pg_flags, PG_BUSY|PG_WANTED);
                        UVM_PAGE_OWN(p, NULL);
 
-                       /* released during I/O? Can only happen for anons */
+                       /* released during I/O? */
                        if (p->pg_flags & PG_RELEASED) {
-                               KASSERT(anon != NULL);
-                               /*
-                                * remove page so we can get nextpg,
-                                * also zero out anon so we don't use
-                                * it after the free.
-                                */
-                               anon->an_page = NULL;
-                               p->uanon = NULL;
+                               if (anon) {
+                                       /*
+                                        * remove page so we can get nextpg,
+                                        * also zero out anon so we don't use
+                                        * it after the free.
+                                        */
+                                       anon->an_page = NULL;
+                                       p->uanon = NULL;
 
-                               simple_unlock(&anon->an_lock);
-                               uvm_anfree(anon);       /* kills anon */
-                               pmap_page_protect(p, VM_PROT_NONE);
-                               anon = NULL;
-                               uvm_lock_pageq();
-                               nextpg = TAILQ_NEXT(p, pageq);
-                               /* free released page */
-                               uvm_pagefree(p);
+                                       simple_unlock(&anon->an_lock);
+                                       uvm_anfree(anon);       /* kills anon */
+                                       pmap_page_protect(p, VM_PROT_NONE);
+                                       anon = NULL;
+                                       uvm_lock_pageq();
+                                       nextpg = TAILQ_NEXT(p, pageq);
+                                       /* free released page */
+                                       uvm_pagefree(p);
+
+                               } else {
+
+                                       /*
+                                        * pgo_releasepg nukes the page and
+                                        * gets "nextpg" for us.  it returns
+                                        * with the page queues locked (when
+                                        * given nextpg ptr).
+                                        */
+
+                                       if (!uobj->pgops->pgo_releasepg(p,
+                                           &nextpg))
+                                               /* uobj died after release */
+                                               uobj = NULL;
+                               }
                        } else {        /* page was not released during I/O */
                                uvm_lock_pageq();
                                nextpg = TAILQ_NEXT(p, pageq);
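
the pagedaemon uses the nextpgp flavour of the contract: pass a
pointer, get the next queue entry back, and the call returns with the
page queues locked. condensed from the hunk above (sketch):

	struct vm_page *nextpg;

	if (!uobj->pgops->pgo_releasepg(p, &nextpg))
		uobj = NULL;	/* uobj died after release; lock is gone */
	/* page queues are locked again and nextpg is safe to follow */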
Index: uvm_vnode.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_vnode.c,v
retrieving revision 1.68
retrieving revision 1.67
diff -u -p -r1.68 -r1.67
--- uvm_vnode.c 22 Jul 2009 21:05:37 -0000      1.68
+++ uvm_vnode.c 9 Jul 2009 22:29:56 -0000       1.67
@@ -1,4 +1,4 @@
-/*     $OpenBSD: uvm_vnode.c,v 1.68 2009/07/22 21:05:37 oga Exp $      */
+/*     $OpenBSD: uvm_vnode.c,v 1.67 2009/07/09 22:29:56 thib Exp $     */
 /*     $NetBSD: uvm_vnode.c,v 1.36 2000/11/24 20:34:01 chs Exp $       */
 
 /*
@@ -93,6 +93,7 @@ void           uvn_init(void);
 int             uvn_io(struct uvm_vnode *, vm_page_t *, int, int, int);
 int             uvn_put(struct uvm_object *, vm_page_t *, int, boolean_t);
 void            uvn_reference(struct uvm_object *);
+boolean_t       uvn_releasepg(struct vm_page *, struct vm_page **);
 
 /*
  * master pager structure
@@ -108,6 +109,7 @@ struct uvm_pagerops uvm_vnodeops = {
        uvn_put,
        uvn_cluster,
        uvm_mk_pcluster, /* use generic version of this: see uvm_pager.c */
+       uvn_releasepg,
 };
 
 /*
@@ -413,23 +415,30 @@ uvn_detach(struct uvm_object *uobj)
        /*
         * given the structure of this pager, the above flush request will
         * create the following state: all the pages that were in the object
-        * have either been free'd or they are marked PG_BUSY and in the 
-        * middle of an async io. If we still have pages we set the "relkill"
-        * state, so that in the case the vnode gets terminated we know 
-        * to leave it alone. Otherwise we'll kill the vnode when it's empty.
+        * have either been free'd or they are marked PG_BUSY|PG_RELEASED.
+        * the PG_BUSY bit was set either by us or the daemon for async I/O.
+        * in either case, if we have pages left we can't kill the object
+        * yet because i/o is pending.  in this case we set the "relkill"
+        * flag which will cause pgo_releasepg to kill the object once all
+        * the I/O's are done [pgo_releasepg will be called from the aiodone
+        * routine or from the page daemon].
         */
 
-       uvn->u_flags |= UVM_VNODE_RELKILL;
-       /* wait on any outstanding io */
-       while (uobj->uo_npages && uvn->u_flags & UVM_VNODE_RELKILL) {
-               uvn->u_flags |= UVM_VNODE_IOSYNC;
-               UVM_UNLOCK_AND_WAIT(&uvn->u_nio, &uvn->u_obj.vmobjlock, FALSE,
-                   "uvn_term",0);
-               simple_lock(&uvn->u_obj.vmobjlock);
-       }
-
-       if ((uvn->u_flags & UVM_VNODE_RELKILL) == 0)
+       if (uobj->uo_npages) {          /* I/O pending.  iodone will free */
+#ifdef DEBUG
+               /*
+                * XXXCDC: very unlikely to happen until we have async i/o
+                * so print a little info message in case it does.
+                */
+               printf("uvn_detach: vn %p has pages left after flush - "
+                   "relkill mode\n", uobj);
+#endif
+               uvn->u_flags |= UVM_VNODE_RELKILL;
+               simple_unlock(&uobj->vmobjlock);
+               UVMHIST_LOG(maphist,"<- done! (releasepg will kill obj)", 0, 0,
+                   0, 0);
                return;
+       }
 
        /*
         * kill object now.   note that we can't be on the sync q because
@@ -481,6 +490,8 @@ uvn_detach(struct uvm_object *uobj)
  * => the caller must XLOCK and VOP_LOCK the vnode before calling us
  *     [protects us from getting a vnode that is already in the DYING
  *      state...]
+ * => unlike uvn_detach, this function must not return until all the
+ *     uvn's pages are disposed of.
  * => in case [2] the uvn is still alive after this call, but all I/O
  *     ops will fail (due to the backing vnode now being "dead").  this
  *     will prob. kill any process using the uvn due to pgo_get failing.
@@ -524,8 +535,8 @@ uvm_vnp_terminate(struct vnode *vp)
 
        /*
         * it is possible that the uvn was detached and is in the relkill
-        * state [i.e. waiting for async i/o to finish].
-        * we take over the vnode now and cancel the relkill.
+        * state [i.e. waiting for async i/o to finish so that releasepg can
+        * kill object].  we take over the vnode now and cancel the relkill.
         * we want to know when the i/o is done so we can recycle right
         * away.   note that a uvn can only be in the RELKILL state if it
         * has a zero reference count.
@@ -619,6 +630,72 @@ uvm_vnp_terminate(struct vnode *vp)
 }
 
 /*
+ * uvn_releasepg: handled a released page in a uvn
+ *
+ * => "pg" is a PG_BUSY [caller owns it], PG_RELEASED page that we need
+ *     to dispose of.
+ * => caller must handled PG_WANTED case
+ * => called with page's object locked, pageq's unlocked
+ * => returns TRUE if page's object is still alive, FALSE if we
+ *     killed the page's object.    if we return TRUE, then we
+ *     return with the object locked.
+ * => if (nextpgp != NULL) => we return pageq.tqe_next here, and return
+ *                             with the page queues locked [for pagedaemon]
+ * => if (nextpgp == NULL) => we return with page queues unlocked [normal case]
+ * => we kill the uvn if it is not referenced and we are suppose to
+ *     kill it ("relkill").
+ */
+
+boolean_t
+uvn_releasepg(struct vm_page *pg, struct vm_page **nextpgp /* OUT */)
+{
+       struct uvm_vnode *uvn = (struct uvm_vnode *) pg->uobject;
+       struct vnode *vp = (struct vnode *)uvn;
+#ifdef DIAGNOSTIC
+       if ((pg->pg_flags & PG_RELEASED) == 0)
+               panic("uvn_releasepg: page not released!");
+#endif
+
+       /*
+        * dispose of the page [caller handles PG_WANTED]
+        */
+       pmap_page_protect(pg, VM_PROT_NONE);
+       uvm_lock_pageq();
+       if (nextpgp)
+               *nextpgp = TAILQ_NEXT(pg, pageq); /* next page for daemon */
+       uvm_pagefree(pg);
+       if (!nextpgp)
+               uvm_unlock_pageq();
+
+       /*
+        * now see if we need to kill the object
+        */
+       if (uvn->u_flags & UVM_VNODE_RELKILL) {
+               if (uvn->u_obj.uo_refs)
+                       panic("uvn_releasepg: kill flag set on referenced "
+                           "object!");
+               if (uvn->u_obj.uo_npages == 0) {
+                       if (uvn->u_flags & UVM_VNODE_WRITEABLE) {
+                               LIST_REMOVE(uvn, u_wlist);
+                       }
+#ifdef DIAGNOSTIC
+                       if (!TAILQ_EMPTY(&uvn->u_obj.memq))
+       panic("uvn_releasepg: pages in object with npages == 0");
+#endif
+                       if (uvn->u_flags & UVM_VNODE_WANTED)
+                               /* still holding object lock */
+                               wakeup(uvn);
+
+                       uvn->u_flags = 0;               /* DEAD! */
+                       simple_unlock(&uvn->u_obj.vmobjlock);
+                       vrele(vp);
+                       return (FALSE);
+               }
+       }
+       return (TRUE);
+}
+
+/*
  * NOTE: currently we have to use VOP_READ/VOP_WRITE because they go
  * through the buffer cache and allow I/O in any size.  These VOPs use
  * synchronous i/o.  [vs. VOP_STRATEGY which can be async, but doesn't
@@ -652,6 +729,8 @@ uvm_vnp_terminate(struct vnode *vp)
  * - if (object->iosync && u_naio == 0) { wakeup &uvn->u_naio }
  * - get "page" structures (atop?).
  * - handle "wanted" pages
+ * - handle "released" pages [using pgo_releasepg]
+ *   >>> pgo_releasepg may kill the object
  * dont forget to look at "object" wanted flag in all cases.
  */
 
@@ -713,13 +792,15 @@ boolean_t
 uvn_flush(struct uvm_object *uobj, voff_t start, voff_t stop, int flags)
 {
        struct uvm_vnode *uvn = (struct uvm_vnode *) uobj;
-       struct vm_page *pp, *ptmp;
+       struct vm_page *pp, *ppnext, *ptmp;
        struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
        int npages, result, lcv;
-       boolean_t retval, need_iosync, needs_clean;
+       boolean_t retval, need_iosync, by_list, needs_clean, all;
        voff_t curoff;
+       u_short pp_version;
        UVMHIST_FUNC("uvn_flush"); UVMHIST_CALLED(maphist);
 
+       curoff = 0;     /* XXX: shut up gcc */
        /*
         * get init vals and determine how we are going to traverse object
         */
@@ -727,16 +808,24 @@ uvn_flush(struct uvm_object *uobj, voff_
        need_iosync = FALSE;
        retval = TRUE;          /* return value */
        if (flags & PGO_ALLPAGES) {
-               start = 0;
-               stop = round_page(uvn->u_size);
+               all = TRUE;
+               by_list = TRUE;         /* always go by the list */
        } else {
                start = trunc_page(start);
-               stop = MIN(round_page(stop), round_page(uvn->u_size));
+               stop = round_page(stop);
+#ifdef DEBUG
+               if (stop > round_page(uvn->u_size))
+                       printf("uvn_flush: strange, got an out of range "
+                           "flush (fixed)\n");
+#endif
+               all = FALSE;
+               by_list = (uobj->uo_npages <=
+                   ((stop - start) >> PAGE_SHIFT) * UVN_HASH_PENALTY);
        }
 
        UVMHIST_LOG(maphist,
-           " flush start=0x%lx, stop=0x%lx, flags=0x%lx",
-           (u_long)start, (u_long)stop, flags, 0);
+           " flush start=0x%lx, stop=0x%lx, by_list=%ld, flags=0x%lx",
+           (u_long)start, (u_long)stop, by_list, flags);
 
        /*
         * PG_CLEANCHK: this bit is used by the pgo_mk_pcluster function as
@@ -749,21 +838,75 @@ uvn_flush(struct uvm_object *uobj, voff_
         * [borrowed PG_CLEANCHK idea from FreeBSD VM]
         */
 
-       if ((flags & PGO_CLEANIT) != 0) {
-               KASSERT(uobj->pgops->pgo_mk_pcluster != 0);
-               for (curoff = start ; curoff < stop; curoff += PAGE_SIZE) {
-                       if ((pp = uvm_pagelookup(uobj, curoff)) != NULL)
+       if ((flags & PGO_CLEANIT) != 0 &&
+           uobj->pgops->pgo_mk_pcluster != NULL) {
+               if (by_list) {
+                       TAILQ_FOREACH(pp, &uobj->memq, listq) {
+                               if (!all &&
+                                   (pp->offset < start || pp->offset >= stop))
+                                       continue;
                                atomic_clearbits_int(&pp->pg_flags,
                                    PG_CLEANCHK);
+                       }
+
+               } else {   /* by hash */
+                       for (curoff = start ; curoff < stop;
+                           curoff += PAGE_SIZE) {
+                               pp = uvm_pagelookup(uobj, curoff);
+                               if (pp)
+                                       atomic_clearbits_int(&pp->pg_flags,
+                                           PG_CLEANCHK);
+                       }
                }
        }
 
+       /*
+        * now do it.   note: we must update ppnext in body of loop or we
+        * will get stuck.  we need to use ppnext because we may free "pp"
+        * before doing the next loop.
+        */
+
+       if (by_list) {
+               pp = TAILQ_FIRST(&uobj->memq);
+       } else {
+               curoff = start;
+               pp = uvm_pagelookup(uobj, curoff);
+       }
+
+       ppnext = NULL;  /* XXX: shut up gcc */
        ppsp = NULL;            /* XXX: shut up gcc */
        uvm_lock_pageq();       /* page queues locked */
+
        /* locked: both page queues and uobj */
-       for (curoff = start; curoff < stop; curoff += PAGE_SIZE) {
-               if ((pp = uvm_pagelookup(uobj, curoff)) == NULL)
-                       continue;
+       for ( ; (by_list && pp != NULL) ||
+         (!by_list && curoff < stop) ; pp = ppnext) {
+
+               if (by_list) {
+
+                       /*
+                        * range check
+                        */
+
+                       if (!all &&
+                           (pp->offset < start || pp->offset >= stop)) {
+                               ppnext = TAILQ_NEXT(pp, listq);
+                               continue;
+                       }
+
+               } else {
+
+                       /*
+                        * null check
+                        */
+
+                       curoff += PAGE_SIZE;
+                       if (pp == NULL) {
+                               if (curoff < stop)
+                                       ppnext = uvm_pagelookup(uobj, curoff);
+                               continue;
+                       }
+
+               }
 
                /*
                 * handle case where we do not need to clean page (either
@@ -800,32 +943,37 @@ uvn_flush(struct uvm_object *uobj, voff_
                }
 
                /*
-                * if we don't need a clean... deactivate/free pages then cont.
+                * if we don't need a clean... load ppnext and dispose of pp
                 */
                if (!needs_clean) {
+                       /* load ppnext */
+                       if (by_list)
+                               ppnext = TAILQ_NEXT(pp, listq);
+                       else {
+                               if (curoff < stop)
+                                       ppnext = uvm_pagelookup(uobj, curoff);
+                       }
+
+                       /* now dispose of pp */
                        if (flags & PGO_DEACTIVATE) {
                                if ((pp->pg_flags & PQ_INACTIVE) == 0 &&
                                    pp->wire_count == 0) {
                                        pmap_page_protect(pp, VM_PROT_NONE);
                                        uvm_pagedeactivate(pp);
                                }
+
                        } else if (flags & PGO_FREE) {
                                if (pp->pg_flags & PG_BUSY) {
+                                       /* release busy pages */
                                        atomic_setbits_int(&pp->pg_flags,
-                                           PG_WANTED);
-                                       uvm_unlock_pageq();
-                                       UVM_UNLOCK_AND_WAIT(pp,
-                                           &uobj->vmobjlock, 0, "uvn_flsh", 0);
-                                       simple_lock(&uobj->vmobjlock);
-                                       uvm_lock_pageq();
-                                       curoff -= PAGE_SIZE;
-                                       continue;
+                                           PG_RELEASED);
                                } else {
                                        pmap_page_protect(pp, VM_PROT_NONE);
                                        /* removed page from object */
                                        uvm_pagefree(pp);
                                }
                        }
+                       /* ppnext is valid so we can continue... */
                        continue;
                }
 
@@ -841,9 +989,7 @@ uvn_flush(struct uvm_object *uobj, voff_
                atomic_setbits_int(&pp->pg_flags, PG_BUSY);
                UVM_PAGE_OWN(pp, "uvn_flush");
                pmap_page_protect(pp, VM_PROT_READ);
-               /* if we're async, free the page in aiodoned */
-               if ((flags & (PGO_FREE|PGO_SYNCIO)) == PGO_FREE)
-                       atomic_setbits_int(&pp->pg_flags, PG_RELEASED);
+               pp_version = pp->pg_version;
 ReTry:
                ppsp = pps;
                npages = sizeof(pps) / sizeof(struct vm_page *);
@@ -854,11 +1000,11 @@ ReTry:
                /* unlocked: page queues, uobj */
 
                /*
-                * if we did an async I/O it is remotely possible for the
-                * async i/o to complete and the page "pp" be freed or what
-                * not before we get a chance to relock the object. Therefore,
-                * we only touch it when it won't be freed, RELEASED took care
-                * of the rest.
+                * at this point nothing is locked.   if we did an async I/O
+                * it is remotely possible for the async i/o to complete and
+                * the page "pp" be freed or what not before we get a chance
+                * to relock the object.   in order to detect this, we have
+                * saved the version number of the page in "pp_version".
                 */
 
                /* relock! */
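
fwiw, the pg_version dance these hunks bring back is the usual
revalidate-after-unlock pattern: vm_page structures are recycled but
never returned to the allocator, so after sleeping it is safe to re-read
pg_version and compare it with the copy saved before the locks were
dropped. a rough sketch of the idea, with invented struct names:

    struct page {
            int version;            /* bumped when the page is recycled */
            struct page *next;      /* next page on the object's list */
    };

    struct object {
            struct page *head;      /* first page on the object's list */
    };

    /*
     * after retaking the object lock: trust the saved pointer only
     * if the version is unchanged, otherwise restart the walk.
     */
    struct page *
    resume_walk(struct object *obj, struct page *pp, int saved_version)
    {
            if (pp->version == saved_version)
                    return (pp->next);      /* untouched while we slept */
            return (obj->head);             /* recycled; rescan the list */
    }
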
@@ -867,7 +1013,7 @@ ReTry:
 
                /*
                 * VM_PAGER_AGAIN: given the structure of this pager, this
-                * can only happen when we are doing async I/O and can't
+                * can only happen when  we are doing async I/O and can't
                 * map the pages into kernel memory (pager_map) due to lack
                 * of vm space.   if this happens we drop back to sync I/O.
                 */
@@ -885,10 +1031,6 @@ ReTry:
        panic("uvn_flush: PGO_SYNCIO return 'try again' error (impossible)");
 #endif
                        flags |= PGO_SYNCIO;
-                       if (flags & PGO_FREE)
-                               atomic_clearbits_int(&pp->pg_flags,
-                                   PG_RELEASED);
-
                        goto ReTry;
                }
 
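the VM_PAGER_AGAIN recovery above is just "retry the same put
synchronously". in isolation, with the constant values and pager_put()
invented for illustration:

    #define VM_PAGER_OK     0
    #define VM_PAGER_AGAIN  1       /* values made up for the sketch */
    #define PGO_SYNCIO      0x02

    extern int pager_put(void *pg, int flags);

    int
    put_with_fallback(void *pg, int flags)
    {
            int result;
    ReTry:
            result = pager_put(pg, flags);
            if (result == VM_PAGER_AGAIN) {
                    /* no kva for async i/o; redo it synchronously */
                    flags |= PGO_SYNCIO;
                    goto ReTry;
            }
            return (result);
    }
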
@@ -900,20 +1042,66 @@ ReTry:
                 */
 
                /*
-                * for pending async i/o if we are not deactivating
-                * we can move on to the next page. aiodoned deals with
-                * the freeing case for us.
+                * for pending async i/o if we are not deactivating/freeing
+                * we can move on to the next page.
                 */
-               if (result == VM_PAGER_PEND && (flags & PGO_DEACTIVATE) == 0)
-                       continue;
+
+               if (result == VM_PAGER_PEND) {
+
+                       if ((flags & (PGO_DEACTIVATE|PGO_FREE)) == 0) {
+                               /*
+                                * no per-page ops: refresh ppnext and continue
+                                */
+                               if (by_list) {
+                                       if (pp->pg_version == pp_version)
+                                               ppnext = TAILQ_NEXT(pp, listq);
+                                       else
+                                               /* reset */
+                                               ppnext = TAILQ_FIRST(&uobj->memq);
+                               } else {
+                                       if (curoff < stop)
+                                               ppnext = uvm_pagelookup(uobj,
+                                                   curoff);
+                               }
+                               continue;
+                       }
+
+                       /* need to do anything here? */
+               }
 
                /*
-                * need to look at each page of the I/O operation, and do what
-                * we gotta do.
+                * need to look at each page of the I/O operation.  we defer
+                * processing "pp" until the last trip through this "for" loop
+                * so that we can load "ppnext" for the main loop after we
+                * play with the cluster pages [thus the "npages + 1" in the
+                * loop below].
                 */
 
-               for (lcv = 0 ; lcv < npages; lcv++) {
-                       ptmp = ppsp[lcv];
+               for (lcv = 0 ; lcv < npages + 1 ; lcv++) {
+
+                       /*
+                        * handle ppnext for outside loop, and saving pp
+                        * until the end.
+                        */
+                       if (lcv < npages) {
+                               if (ppsp[lcv] == pp)
+                                       continue; /* skip pp until the end */
+                               ptmp = ppsp[lcv];
+                       } else {
+                               ptmp = pp;
+
+                               /* set up next page for outer loop */
+                               if (by_list) {
+                                       if (pp->pg_version == pp_version)
+                                               ppnext = TAILQ_NEXT(pp, listq);
+                                       else
+                                               /* reset */
+                                               ppnext = TAILQ_FIRST(&uobj->memq);
+                               } else {
+                                       if (curoff < stop)
+                                               ppnext = uvm_pagelookup(uobj, curoff);
+                               }
+                       }
 
                        /*
                         * verify the page didn't get moved while obj was
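
the "npages + 1" loop restored above is a small trick: every cluster
page except the main one is processed first, and pp itself is deferred
to the extra final trip so that ppnext can be computed after the cluster
has been dealt with. the shape of it, detached from uvm:

    /* process each entry, saving the main one for the extra last trip */
    void
    process_cluster(int **cluster, int npages, int *main_entry)
    {
            int lcv, *cur;

            for (lcv = 0; lcv < npages + 1; lcv++) {
                    if (lcv < npages) {
                            if (cluster[lcv] == main_entry)
                                    continue;       /* saved for the end */
                            cur = cluster[lcv];
                    } else {
                            cur = main_entry;
                            /* main entry last: pick the next page here */
                    }
                    /* ... common per-entry work on cur ... */
                    (void)cur;
            }
    }
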
@@ -937,10 +1125,25 @@ ReTry:
                                atomic_clearbits_int(&ptmp->pg_flags,
                                    PG_WANTED|PG_BUSY);
                                UVM_PAGE_OWN(ptmp, NULL);
-                               atomic_setbits_int(&ptmp->pg_flags,
-                                   PG_CLEAN|PG_CLEANCHK);
-                               if ((flags & PGO_FREE) == 0)
-                                       pmap_clear_modify(ptmp);
+                               if (ptmp->pg_flags & PG_RELEASED) {
+
+                                       /*
+                                        * pgo_releasepg needs to grab the
+                                        * pageq lock itself.
+                                        */
+                                       uvm_unlock_pageq();
+                                       if (!uvn_releasepg(ptmp, NULL))
+                                               return (TRUE);
+
+                                       uvm_lock_pageq();       /* relock */
+                                       continue;               /* next page */
+
+                               } else {
+                                       atomic_setbits_int(&ptmp->pg_flags,
+                                           PG_CLEAN|PG_CLEANCHK);
+                                       if ((flags & PGO_FREE) == 0)
+                                               pmap_clear_modify(ptmp);
+                               }
                        }
 
                        /*
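
that change is the consumer half of PG_RELEASED: whoever ends up
un-busying a page must check the flag and hand the page to
pgo_releasepg, which may find the whole object has died (it returns
FALSE in that case, hence the early return (TRUE) above). a minimal
sketch of that handshake, flags and types invented here:

    #include <stdbool.h>
    #include <stdlib.h>

    #define PG_BUSY         0x01    /* someone holds the page for i/o */
    #define PG_RELEASED     0x02    /* free the page when un-busied */

    struct page { int flags; };

    /* stand-in for pgo_releasepg; false would mean the object died too */
    static bool
    releasepg(struct page *pg)
    {
            free(pg);
            return (true);
    }

    /* whoever would clear PG_BUSY does the deferred free instead */
    static bool
    unbusy(struct page *pg)
    {
            if (pg->flags & PG_RELEASED)
                    return (releasepg(pg));
            pg->flags &= ~PG_BUSY;
            return (true);
    }
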
@@ -953,21 +1156,29 @@ ReTry:
                                        pmap_page_protect(ptmp, VM_PROT_NONE);
                                        uvm_pagedeactivate(ptmp);
                                }
-                       } else if (flags & PGO_FREE &&
-                           result != VM_PAGER_PEND) {
-                               if (result != VM_PAGER_OK) {
-                                       printf("uvn_flush: obj=%p, "
-                                          "offset=0x%llx.  error "
-                                          "during pageout.\n",
-                                           pp->uobject,
-                                           (long long)pp->offset);
-                                       printf("uvn_flush: WARNING: "
-                                           "changes to page may be "
-                                           "lost!\n");
-                                       retval = FALSE;
+
+                       } else if (flags & PGO_FREE) {
+                               if (result == VM_PAGER_PEND) {
+                                       if ((ptmp->pg_flags & PG_BUSY) != 0)
+                                               /* signal for i/o done */
+                                               atomic_setbits_int(
+                                                   &ptmp->pg_flags,
+                                                   PG_RELEASED);
+                               } else {
+                                       if (result != VM_PAGER_OK) {
+                                               printf("uvn_flush: obj=%p, "
+                                                  "offset=0x%llx.  error "
+                                                  "during pageout.\n",
+                                                   pp->uobject,
+                                                   (long long)pp->offset);
+                                               printf("uvn_flush: WARNING: "
+                                                   "changes to page may be "
+                                                   "lost!\n");
+                                               retval = FALSE;
+                                       }
+                                       pmap_page_protect(ptmp, VM_PROT_NONE);
+                                       uvm_pagefree(ptmp);
                                }
-                               pmap_page_protect(ptmp, VM_PROT_NONE);
-                               uvm_pagefree(ptmp);
                        }
 
                }               /* end of "lcv" for loop */
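
and that hunk is the producer half: with the pageout still pending the
page cannot be freed on the spot, so it is tagged PG_RELEASED and the
i/o-done path performs the free via the unbusy step sketched above.
roughly, with the same invented flags and struct page:

    /* can't free a page with i/o pending; tag it for the i/o-done path */
    void
    free_or_defer(struct page *pg, void (*pagefree)(struct page *))
    {
            if (pg->flags & PG_BUSY)
                    pg->flags |= PG_RELEASED;
            else
                    pagefree(pg);           /* safe to free right away */
    }
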
@@ -1110,7 +1321,7 @@ uvn_get(struct uvm_object *uobj, voff_t 
 
                        /* to be useful must get a non-busy, non-released pg */
                        if (ptmp == NULL ||
-                           (ptmp->pg_flags & PG_BUSY) != 0) {
+                           (ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) {
                                if (lcv == centeridx || (flags & PGO_ALLPAGES)
                                    != 0)
                                done = FALSE;   /* need to do a wait or I/O! */
@@ -1216,7 +1427,7 @@ uvn_get(struct uvm_object *uobj, voff_t 
                        }
 
                        /* page is there, see if we need to wait on it */
-                       if ((ptmp->pg_flags & PG_BUSY) != 0) {
+                       if ((ptmp->pg_flags & (PG_BUSY|PG_RELEASED)) != 0) {
                                atomic_setbits_int(&ptmp->pg_flags, PG_WANTED);
                                UVM_UNLOCK_AND_WAIT(ptmp,
                                    &uobj->vmobjlock, FALSE, "uvn_get",0);
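
the uvn_get changes at the end follow from the same rule: a PG_RELEASED
page is as off-limits as a PG_BUSY one, so the getter marks it PG_WANTED
and sleeps, and must re-lookup the page on wakeup because it may have
been freed in the meantime. a rough pthread rendering of that wait,
names invented:

    #include <pthread.h>

    #define PG_BUSY         0x01
    #define PG_RELEASED     0x02
    #define PG_WANTED       0x04

    struct page {
            int flags;
            pthread_cond_t wanted;  /* stand-in for the wakeup channel */
    };

    /*
     * cond_wait drops and retakes objlock around the sleep, just as
     * UVM_UNLOCK_AND_WAIT does with the object lock.
     */
    void
    wait_for_page(struct page *pg, pthread_mutex_t *objlock)
    {
            pg->flags |= PG_WANTED;
            pthread_cond_wait(&pg->wanted, objlock);
            /* caller re-looks the page up; it may be gone by now */
    }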
