On Fri, May 25, 2001 at 08:29:38PM -0400, Ben LaHaise wrote:
> amount of bounce buffers to guarantee progress while submitting io.  The
> -ac kernels have a patch from Ingo that provides private pools for bounce
> buffers and buffer_heads.  I went a step further and have a memory
> reservation patch that provides for memory pools being reserved against a
> particular zone.  This is needed to prevent the starvation that irq
> allocations can cause.
> 
> Some of these cleanups are 2.5 fodder, but we really need something in 2.4
> right now, so...

Please merge this one into 2.4 for now (originally from Ingo; I only
improved it).  This is a real, definitive fix, and there is no nicer way
to handle it unless you want to generalize an API that lets people create
private anti-deadlock ("always make progress") memory pools; a rough
sketch of such an API follows the patch:

diff -urN 2.4.4/mm/highmem.c highmem-deadlock/mm/highmem.c
--- 2.4.4/mm/highmem.c  Sat Apr 28 05:24:48 2001
+++ highmem-deadlock/mm/highmem.c       Sat Apr 28 18:21:24 2001
@@ -159,6 +159,19 @@
        spin_unlock(&kmap_lock);
 }
 
+#define POOL_SIZE 32
+
+/*
+ * This lock gets no contention at all, normally.
+ */
+static spinlock_t emergency_lock = SPIN_LOCK_UNLOCKED;
+
+int nr_emergency_pages;
+static LIST_HEAD(emergency_pages);
+
+int nr_emergency_bhs;
+static LIST_HEAD(emergency_bhs);
+
 /*
  * Simple bounce buffer support for highmem pages.
  * This will be moved to the block layer in 2.5.
@@ -203,17 +216,72 @@
 
 static inline void bounce_end_io (struct buffer_head *bh, int uptodate)
 {
+       struct page *page;
        struct buffer_head *bh_orig = (struct buffer_head *)(bh->b_private);
+       unsigned long flags;
 
        bh_orig->b_end_io(bh_orig, uptodate);
-       __free_page(bh->b_page);
+
+       page = bh->b_page;
+
+       spin_lock_irqsave(&emergency_lock, flags);
+       if (nr_emergency_pages >= POOL_SIZE)
+               __free_page(page);
+       else {
+               /*
+                * We are abusing page->list to manage
+                * the highmem emergency pool:
+                */
+               list_add(&page->list, &emergency_pages);
+               nr_emergency_pages++;
+       }
+       
+       if (nr_emergency_bhs >= POOL_SIZE) {
 #ifdef HIGHMEM_DEBUG
-       /* Don't clobber the constructed slab cache */
-       init_waitqueue_head(&bh->b_wait);
+               /* Don't clobber the constructed slab cache */
+               init_waitqueue_head(&bh->b_wait);
 #endif
-       kmem_cache_free(bh_cachep, bh);
+               kmem_cache_free(bh_cachep, bh);
+       } else {
+               /*
+                * Ditto in the bh case, here we abuse b_inode_buffers:
+                */
+               list_add(&bh->b_inode_buffers, &emergency_bhs);
+               nr_emergency_bhs++;
+       }
+       spin_unlock_irqrestore(&emergency_lock, flags);
 }
 
+static __init int init_emergency_pool(void)
+{
+       spin_lock_irq(&emergency_lock);
+       while (nr_emergency_pages < POOL_SIZE) {
+               struct page * page = alloc_page(GFP_ATOMIC);
+               if (!page) {
+                       printk("couldn't refill highmem emergency pages");
+                       break;
+               }
+               list_add(&page->list, &emergency_pages);
+               nr_emergency_pages++;
+       }
+       while (nr_emergency_bhs < POOL_SIZE) {
+               struct buffer_head * bh = kmem_cache_alloc(bh_cachep, SLAB_ATOMIC);
+               if (!bh) {
+                       printk("couldn't refill highmem emergency bhs");
+                       break;
+               }
+               list_add(&bh->b_inode_buffers, &emergency_bhs);
+               nr_emergency_bhs++;
+       }
+       spin_unlock_irq(&emergency_lock);
+       printk("allocated %d pages and %d bhs reserved for the highmem bounces\n",
+              nr_emergency_pages, nr_emergency_bhs);
+
+       return 0;
+}
+
+__initcall(init_emergency_pool);
+
 static void bounce_end_io_write (struct buffer_head *bh, int uptodate)
 {
        bounce_end_io(bh, uptodate);
@@ -228,6 +296,82 @@
        bounce_end_io(bh, uptodate);
 }
 
+struct page *alloc_bounce_page (void)
+{
+       struct list_head *tmp;
+       struct page *page;
+
+repeat_alloc:
+       page = alloc_page(GFP_BUFFER);
+       if (page)
+               return page;
+       /*
+        * No luck. First, kick the VM so it doesn't idle around while
+        * we are using up our emergency rations.
+        */
+       wakeup_bdflush(0);
+
+       /*
+        * Try to allocate from the emergency pool.
+        */
+       tmp = &emergency_pages;
+       spin_lock_irq(&emergency_lock);
+       if (!list_empty(tmp)) {
+               page = list_entry(tmp->next, struct page, list);
+               list_del(tmp->next);
+               nr_emergency_pages--;
+       }
+       spin_unlock_irq(&emergency_lock);
+       if (page)
+               return page;
+
+       /* we need to wait for I/O completion */
+       run_task_queue(&tq_disk);
+
+       current->policy |= SCHED_YIELD;
+       __set_current_state(TASK_RUNNING);
+       schedule();
+       goto repeat_alloc;
+}
+
+struct buffer_head *alloc_bounce_bh (void)
+{
+       struct list_head *tmp;
+       struct buffer_head *bh;
+
+repeat_alloc:
+       bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
+       if (bh)
+               return bh;
+       /*
+        * No luck. First, kick the VM so it doesn't idle around while
+        * we are using up our emergency rations.
+        */
+       wakeup_bdflush(0);
+
+       /*
+        * Try to allocate from the emergency pool.
+        */
+       tmp = &emergency_bhs;
+       spin_lock_irq(&emergency_lock);
+       if (!list_empty(tmp)) {
+               bh = list_entry(tmp->next, struct buffer_head, b_inode_buffers);
+               list_del(tmp->next);
+               nr_emergency_bhs--;
+       }
+       spin_unlock_irq(&emergency_lock);
+       if (bh)
+               return bh;
+
+       /* we need to wait for I/O completion */
+       run_task_queue(&tq_disk);
+
+       current->policy |= SCHED_YIELD;
+       __set_current_state(TASK_RUNNING);
+       schedule();
+       goto repeat_alloc;
+}
+
 struct buffer_head * create_bounce(int rw, struct buffer_head * bh_orig)
 {
        struct page *page;
@@ -236,24 +380,15 @@
        if (!PageHighMem(bh_orig->b_page))
                return bh_orig;
 
-repeat_bh:
-       bh = kmem_cache_alloc(bh_cachep, SLAB_BUFFER);
-       if (!bh) {
-               wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-               goto repeat_bh;
-       }
+       bh = alloc_bounce_bh();
        /*
         * This is wasteful for 1k buffers, but this is a stopgap measure
         * and we are being ineffective anyway. This approach simplifies
         * things immensly. On boxes with more than 4GB RAM this should
         * not be an issue anyway.
         */
-repeat_page:
-       page = alloc_page(GFP_BUFFER);
-       if (!page) {
-               wakeup_bdflush(1);  /* Sets task->state to TASK_RUNNING */
-               goto repeat_page;
-       }
+       page = alloc_bounce_page();
+
        set_bh_page(bh, page, 0);
 
        bh->b_next = NULL;


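As for the generalized API mentioned above, here is a rough sketch of
what such an "always make progress" pool could look like.  Illustration
only: this is not part of the patch, it is untested, every name in it
(progress_pool, pool_init, pool_alloc, pool_free) is made up, and the
header list is approximate; the locking and retry logic simply mirror
what the highmem patch above hard-codes for pages and buffer_heads.

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/fs.h>           /* wakeup_bdflush() */
#include <linux/tqueue.h>
#include <linux/blkdev.h>       /* tq_disk */

#define POOL_RESERVE 32

struct progress_pool {
        spinlock_t      lock;
        int             nr_free;
        void            *reserve[POOL_RESERVE];
        void            *(*alloc_one)(int gfp_mask);    /* e.g. a wrapper around alloc_page() */
        void            (*free_one)(void *obj);
};

/* Fill the reserve up front, while allocations may still sleep. */
static int pool_init(struct progress_pool *pool,
                     void *(*alloc_one)(int), void (*free_one)(void *))
{
        spin_lock_init(&pool->lock);
        pool->nr_free = 0;
        pool->alloc_one = alloc_one;
        pool->free_one = free_one;
        while (pool->nr_free < POOL_RESERVE) {
                void *obj = alloc_one(GFP_KERNEL);
                if (!obj)
                        return -ENOMEM;
                pool->reserve[pool->nr_free++] = obj;
        }
        return 0;
}

/*
 * Never gives up: try the normal allocator first, then the private
 * reserve, and if both are empty wait for in-flight I/O to give objects
 * back through pool_free() and retry.
 */
static void *pool_alloc(struct progress_pool *pool, int gfp_mask)
{
        unsigned long flags;
        void *obj;

        for (;;) {
                obj = pool->alloc_one(gfp_mask);
                if (obj)
                        return obj;

                /* Kick the VM so it doesn't idle while we eat the reserve. */
                wakeup_bdflush(0);

                spin_lock_irqsave(&pool->lock, flags);
                if (pool->nr_free) {
                        obj = pool->reserve[--pool->nr_free];
                        spin_unlock_irqrestore(&pool->lock, flags);
                        return obj;
                }
                spin_unlock_irqrestore(&pool->lock, flags);

                /* Reserve exhausted: wait for I/O completion to refill it. */
                run_task_queue(&tq_disk);
                current->policy |= SCHED_YIELD;
                __set_current_state(TASK_RUNNING);
                schedule();
        }
}

/* Completion path: top the reserve back up before really freeing. */
static void pool_free(struct progress_pool *pool, void *obj)
{
        unsigned long flags;

        spin_lock_irqsave(&pool->lock, flags);
        if (pool->nr_free < POOL_RESERVE) {
                pool->reserve[pool->nr_free++] = obj;
                spin_unlock_irqrestore(&pool->lock, flags);
                return;
        }
        spin_unlock_irqrestore(&pool->lock, flags);
        pool->free_one(obj);
}

The highmem patch is effectively two hard-coded instances of this (one
for pages, one for buffer_heads), with bounce_end_io() playing the role
of pool_free() and refilling the reserve from the I/O completion path.
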
And please merge this one as well; it avoids tight loops in getblk, with
no reschedule in between, when the normal zone is empty (the retry idiom
it switches to is sketched after the patch):

diff -urN 2.4.4pre1/fs/buffer.c 2.4.4pre1-blkdev/fs/buffer.c
--- 2.4.4pre1/fs/buffer.c       Sun Apr  1 01:17:30 2001
+++ 2.4.4pre1-blkdev/fs/buffer.c        Mon Apr  9 15:37:20 2001
@@ -628,7 +622,7 @@
    to do in order to release the ramdisk memory is to destroy dirty buffers.
 
    These are two special cases. Normal usage imply the device driver
-   to issue a sync on the device (without waiting I/O completation) and
+   to issue a sync on the device (without waiting I/O completion) and
    then an invalidate_buffers call that doesn't trash dirty buffers. */
 void __invalidate_buffers(kdev_t dev, int destroy_dirty_buffers)
 {
@@ -762,7 +756,12 @@
        balance_dirty(NODEV);
        if (free_shortage())
                page_launder(GFP_BUFFER, 0);
-       grow_buffers(size);
+       if (!grow_buffers(size)) {
+               wakeup_bdflush(1);
+               current->policy |= SCHED_YIELD;
+               __set_current_state(TASK_RUNNING);
+               schedule();
+       }
 }
 
 void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
@@ -1027,12 +1026,13 @@
        write_unlock(&hash_table_lock);
        spin_unlock(&lru_list_lock);
        refill_freelist(size);
+       /* FIXME: getblk should fail if there's not enough memory */
        goto repeat;
 }
 
 /* -1 -> no need to flush
     0 -> async flush
-    1 -> sync flush (wait for I/O completation) */
+    1 -> sync flush (wait for I/O completion) */
 int balance_dirty_state(kdev_t dev)
 {
        unsigned long dirty, tot, hard_dirty_limit, soft_dirty_limit;
@@ -1431,6 +1431,7 @@
 {
        struct buffer_head *bh, *head, *tail;
 
+       /* FIXME: create_buffers should fail if there's not enough memory */
        head = create_buffers(page, blocksize, 1);
        if (page->buffers)
                BUG();
@@ -2367,11 +2368,9 @@
        spin_lock(&free_list[index].lock);
        tmp = bh;
        do {
-               struct buffer_head *p = tmp;
-
-               tmp = tmp->b_this_page;
-               if (buffer_busy(p))
+               if (buffer_busy(tmp))
                        goto busy_buffer_page;
+               tmp = tmp->b_this_page;
        } while (tmp != bh);
 
        spin_lock(&unused_list_lock);

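The retry idiom the refill_freelist() hunk switches to, pulled out on its
own for clarity (illustration only: the patch adds no such helper, the
name is made up, and this assumes it sits in fs/buffer.c where
wakeup_bdflush() is defined):

/*
 * When grow_buffers() makes no progress, get bdflush writing out dirty
 * buffers and yield once before retrying, instead of spinning straight
 * back into the getblk hash lookup.
 */
static void wait_for_buffer_memory(void)
{
        wakeup_bdflush(1);                      /* flush dirty buffers */
        current->policy |= SCHED_YIELD;         /* step aside on the next schedule() */
        __set_current_state(TASK_RUNNING);
        schedule();
}
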
Andrea