Hi guys,

Here's my latest code, which uses ll_rw_block for anon pages (or
pages without a writepage func) when flush_dirty_buffers,
sync_buffers, or fsync_inode_buffers are flushing things.  This
seems to have fixed the slowdown I was seeing with 1k buffer sizes,
but I haven't done extensive benchmarks yet.
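
Roughly, the flush paths now go through a common helper instead of
calling ll_rw_block on the buffer directly.  Here's a simplified
sketch of the dirty_list_writepage code from the patch below (the
page locking, refcounting, and redirty-on-skip handling are left
out here):

    static int dirty_list_writepage(struct page *page, struct buffer_head *bh)
    {
            int (*writepage)(struct page *) = page->mapping->a_ops->writepage;

            /* anon pages, or pages whose mapping has no writepage func:
            ** write just this one dirty buffer via ll_rw_block/submit_bh
            */
            if (!writepage || page->mapping == &anon_space_mapping)
                    return __dirty_list_writepage(page, bh);

            /* everything else goes through the filesystem's writepage */
            ClearPageDirty(page);
            return writepage(page);
    }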

Other changes:  After freeing a page with buffers, page_launder
now stops if (!free_shortage()).  This is a modification of the old
check, where page_launder tested free_shortage() after freeing a
buffer cache page.  With this patch, code outside buffer.c can no
longer detect buffer cache pages, so the old check doesn't apply.

My change doesn't seem quite right though: if page_launder wants
to stop when there isn't a shortage, it should do that regardless of
whether the page it just freed had buffers.  It looks like the old
check was added so bdflush could call page_launder and get an early
out after freeing some buffer heads, but I'm not sure.
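
For reference, the check in page_launder now reads like this after the
mm/vmscan.c hunk below (previously it was gated on freed_page, so only
freed buffer cache pages triggered the early out):

    /*
     * stop when we've got enough free memory.
     */
    if (!free_shortage())
            break;
    continue;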

In test13-pre4, invalidate_buffers skips buffers on any page that
has a mapping.  I changed that so it only skips pages whose mapping
is something other than the anon space mapping.
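
The invalidate_buffers check is now just (from the fs/buffer.c hunk
below):

    /* Part of a mapping? */
    if (bh->b_page->mapping != &anon_space_mapping)
            continue;

so only buffers whose pages sit in the anon space mapping are
candidates for invalidation; pages that belong to a real address
space mapping are still skipped.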

Comments and/or suggestions on how to make better use of this stuff
are more than welcome ;-)

-chris

diff -urN linux-test13-pre4/fs/buffer.c linux-anon-space/fs/buffer.c
--- linux-test13-pre4/fs/buffer.c       Sat Dec 23 13:14:48 2000
+++ linux-anon-space/fs/buffer.c        Tue Dec 26 00:58:06 2000
@@ -97,6 +97,17 @@
 
 static int grow_buffers(int size);
 static void __refile_buffer(struct buffer_head *);
+static int block_write_anon_page(struct page *);
+static void end_buffer_io_async(struct buffer_head * bh, int uptodate) ;
+
+static struct address_space_operations anon_space_ops = {
+       writepage: block_write_anon_page,
+       sync_page: block_sync_page,
+} ;
+static struct address_space anon_space_mapping = {
+       pages: { &anon_space_mapping.pages, &anon_space_mapping.pages },
+       a_ops: &anon_space_ops,
+} ;
 
 /* This is used by some architectures to estimate available memory. */
 atomic_t buffermem_pages = ATOMIC_INIT(0);
@@ -161,6 +172,73 @@
        atomic_dec(&bh->b_count);
 }
 
+/* just for use with anon pages, or pages that don't provide their own
+** writepage func.  We just want to write bh, not the whole page, so we
+** queue that io here instead of calling writepage.
+*/
+static int __dirty_list_writepage(struct page *page, struct buffer_head *bh) {
+       int other_dirty = 0 ;
+       struct buffer_head *cur ;
+
+       /* check for other dirty buffers on this page.  If there are none,
+       ** clear the page dirty bit
+       */
+       cur = bh->b_this_page ;
+       while(cur != bh) {
+               other_dirty += buffer_dirty(cur) ;      
+               cur = cur->b_this_page ;
+       } 
+       if (other_dirty == 0) {
+               ClearPageDirty(page) ;
+       } 
+
+       /* we want the page available for locking again right away.  
+       ** someone walking the dirty buffer list might find another
+       ** buffer from this page, and we don't want them to skip it in
+       ** favor of a younger buffer.
+       */
+       atomic_inc(&bh->b_count) ;
+       ll_rw_block(WRITE, 1, &bh) ;
+       atomic_dec(&bh->b_count) ;
+       UnlockPage(page) ;
+       return 0 ;
+}
+
+/*
+** util function for sync_buffers, fsync_inode_buffers, and
+** flush_dirty_buffers.  Uses either the writepage func supplied in
+** the page's mapping, or the anon address space writepage.
+*/
+static int dirty_list_writepage(struct page *page, struct buffer_head *bh) {
+       int (*writepage)(struct page *)  ;
+       int ret ;
+
+       /* someone wrote this page out while we were waiting for the lock */
+       if (!PageDirty(page) && !buffer_dirty(bh)) {
+               UnlockPage(page) ;
+               return 0 ;
+       }
+       writepage = page->mapping->a_ops->writepage ;
+
+       /* For anon pages, and pages that don't have a writepage
+       ** func, just write this one dirty buffer.  __dirty_list_writepage
+       ** does a little more work to make sure the page dirty bit is cleared
+       ** when we are the only dirty buffer on this page
+       */
+       if (!writepage || page->mapping == &anon_space_mapping) {
+               writepage = anon_space_ops.writepage ;
+               return __dirty_list_writepage(page, bh) ;
+       }
+
+       ClearPageDirty(page) ;
+       ret = writepage(page) ;
+       if (ret == 1) {
+               SetPageDirty(page) ;
+               UnlockPage(page) ;
+       }
+       return ret ;
+}
+
 /* Call sync_buffers with wait!=0 to ensure that the call does not
  * return until all buffer writes have completed.  Sync() may return
  * before the writes have finished; fsync() may not.
@@ -175,6 +253,7 @@
 {
        int i, retry, pass = 0, err = 0;
        struct buffer_head * bh, *next;
+       struct page *page ;
 
        /* One pass for no-wait, three for wait:
         * 0) write out all dirty, unlocked buffers;
@@ -230,10 +309,27 @@
                        if (!buffer_dirty(bh) || pass >= 2)
                                continue;
 
-                       atomic_inc(&bh->b_count);
+                       page = bh->b_page ;
+                       page_cache_get(page) ;
+                       if (TryLockPage(page)) {
+                               if (!wait || !pass) {
+                                       retry = 1 ;
+                                       continue ;
+                               }
+                               spin_unlock(&lru_list_lock);
+                               wait_on_page(page) ;
+                               page_cache_release(page) ;
+                               goto repeat ;
+                       }
                        spin_unlock(&lru_list_lock);
-                       ll_rw_block(WRITE, 1, &bh);
-                       atomic_dec(&bh->b_count);
+
+                       /* if the writepage func returns 1, it is 
+                       ** responsible for marking the buffers dirty
+                       ** again (or not marking them clean at all).
+                       ** we'll catch them again on the next pass
+                       */
+                       dirty_list_writepage(page, bh) ;
+                       page_cache_release(page) ;
                        retry = 1;
                        goto repeat;
                }
@@ -644,7 +740,7 @@
                        if (bh->b_dev != dev)
                                continue;
                        /* Part of a mapping? */
-                       if (bh->b_page->mapping)
+                       if (bh->b_page->mapping != &anon_space_mapping)
                                continue;
                        if (buffer_locked(bh)) {
                                atomic_inc(&bh->b_count);
@@ -852,13 +948,14 @@
 int fsync_inode_buffers(struct inode *inode)
 {
        struct buffer_head *bh;
-       struct inode tmp;
+       struct inode tmp ;
        int err = 0, err2;
+       struct page * page ;
+       int ret ;
        
        INIT_LIST_HEAD(&tmp.i_dirty_buffers);
        
        spin_lock(&lru_list_lock);
-
        while (!list_empty(&inode->i_dirty_buffers)) {
                bh = BH_ENTRY(inode->i_dirty_buffers.next);
                list_del(&bh->b_inode_buffers);
@@ -868,11 +965,28 @@
                        bh->b_inode = &tmp;
                        list_add(&bh->b_inode_buffers, &tmp.i_dirty_buffers);
                        if (buffer_dirty(bh)) {
-                               atomic_inc(&bh->b_count);
+                               page = bh->b_page ;
+                               page_cache_get(page) ;
                                spin_unlock(&lru_list_lock);
-                               ll_rw_block(WRITE, 1, &bh);
-                               brelse(bh);
+
+                               LockPage(page) ;
+                               ret = dirty_list_writepage(page, bh) ;
+                               page_cache_release(page) ;
+
                                spin_lock(&lru_list_lock);
+
+                               /* if the writepage func decided to skip
+                               ** this page, we have to put it back onto
+                               ** the dirty buffer list.  we add onto the 
+                               ** tail so this buffer will be retried after
+                               ** all the other writes have gone through.
+                               */
+                               if (ret == 1) {
+                                       list_del(&bh->b_inode_buffers) ;
+                                       list_add_tail(&bh->b_inode_buffers,
+                                                     &inode->i_dirty_buffers) ;
+                                       bh->b_inode = inode ;
+                               }
                        }
                }
        }
@@ -1101,8 +1215,10 @@
        int dispose = BUF_CLEAN;
        if (buffer_locked(bh))
                dispose = BUF_LOCKED;
-       if (buffer_dirty(bh))
+       if (buffer_dirty(bh)) {
                dispose = BUF_DIRTY;
+               SetPageDirty(bh->b_page) ;
+       }
        if (buffer_protected(bh))
                dispose = BUF_PROTECTED;
        if (dispose != bh->b_list) {
@@ -1478,6 +1594,53 @@
  * "Dirty" is valid only with the last case (mapped+uptodate).
  */
 
+static int block_write_anon_page(struct page *page) 
+{
+       struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+       int i, nr = 0 ;
+       int partial = 0 ;
+       int ret = 0 ;
+
+       if (!PageLocked(page))
+               BUG();
+
+       if (!page->buffers)
+               BUG() ;
+
+       head = page->buffers;
+       bh = head;
+
+       /* Stage 1: find the dirty buffers, lock them for submit_bh */
+       do {
+               if (!test_and_set_bit(BH_Lock, &bh->b_state)) {
+                       if (buffer_uptodate(bh) && buffer_dirty(bh)) {
+                               bh->b_end_io = end_buffer_io_async;
+                               clear_bit(BH_Dirty, &bh->b_state) ;
+                               atomic_inc(&bh->b_count);
+                               arr[nr++] = bh ;
+                       } else {
+                               partial = 1 ;
+                               unlock_buffer(bh) ;
+                       }
+               } else {
+                       partial = 1 ;
+               }
+               bh = bh->b_this_page;
+       } while (bh != head);
+
+       /* Stage 2: submit the IO */
+       for (i = 0 ; i < nr ; i++) {
+               submit_bh(WRITE, arr[i]) ;
+       }
+       /* Done - end_buffer_io_async will unlock */
+       if (!partial)
+               SetPageUptodate(page);
+       if (nr == 0) {
+               UnlockPage(page) ;
+       }
+       return ret ;
+}
+
 /*
  * block_write_full_page() is SMP-safe - currently it's still
  * being called with the kernel lock held, but the code is ready.
@@ -1487,6 +1650,10 @@
        int err, i;
        unsigned long block;
        struct buffer_head *bh, *head;
+       int nr = 0 ;
+       struct buffer_head *arr[MAX_BUF_PER_PAGE] ;
+       int page_ok = Page_Uptodate(page) ;
+       int partial = 0;
 
        if (!PageLocked(page))
                BUG();
@@ -1509,36 +1676,46 @@
                 *
                 * Leave it to the low-level FS to make all those
                 * decisions (block #0 may actually be a valid block)
+                *
+                * only bother when the page is up to date or the buffer
+                * is dirty.
                 */
-               if (!buffer_mapped(bh)) {
-                       err = get_block(inode, block, bh, 1);
-                       if (err)
-                               goto out;
-                       if (buffer_new(bh))
-                               unmap_underlying_metadata(bh);
+               if (page_ok || buffer_dirty(bh)) {
+                       if (!buffer_mapped(bh)) {
+                               err = get_block(inode, block, bh, 1);
+                               if (err)
+                                       goto out;
+                               if (buffer_new(bh))
+                                       unmap_underlying_metadata(bh);
+                       }
+                       arr[nr++] = bh ; 
+               } else {
+                       partial = 1 ;
                }
                bh = bh->b_this_page;
                block++;
        } while (bh != head);
 
        /* Stage 2: lock the buffers, mark them clean */
-       do {
+       for (i = 0 ; i < nr ; i++) {
+               bh = arr[i] ;
                lock_buffer(bh);
                bh->b_end_io = end_buffer_io_async;
                atomic_inc(&bh->b_count);
                set_bit(BH_Uptodate, &bh->b_state);
                clear_bit(BH_Dirty, &bh->b_state);
-               bh = bh->b_this_page;
-       } while (bh != head);
+       } 
 
-       /* Stage 3: submit the IO */
-       do {
-               submit_bh(WRITE, bh);
-               bh = bh->b_this_page;           
-       } while (bh != head);
+       for (i = 0 ; i < nr ; i++) {
+               submit_bh(WRITE, arr[i]) ;
+       }
+
+       if (nr == 0) 
+               UnlockPage(page) ;
 
        /* Done - end_buffer_io_async will unlock */
-       SetPageUptodate(page);
+       if (!partial)
+               SetPageUptodate(page);
        return 0;
 
 out:
@@ -1658,6 +1835,45 @@
 }
 
 /*
+** just sets the dirty bits for a range of buffers in the page.  Does
+** not balance the dirty list, or put the buffers onto the dirty list
+*/
+static int __block_dirty_range(struct inode *inode, struct page *page,
+               unsigned from, unsigned to)
+{
+       unsigned block_start, block_end;
+       int partial = 0 ;
+       unsigned blocksize;
+       struct buffer_head *bh, *head;
+
+       blocksize = inode->i_sb->s_blocksize;
+
+       for(bh = head = page->buffers, block_start = 0;
+           bh != head || !block_start;
+           block_start=block_end, bh = bh->b_this_page) {
+               block_end = block_start + blocksize;
+               if (block_end <= from || block_start >= to) {
+                       if (!buffer_uptodate(bh))
+                               partial = 1;
+               } else {
+                       set_bit(BH_Uptodate, &bh->b_state);
+                       if (!atomic_set_buffer_dirty(bh)) {
+                               buffer_insert_inode_queue(bh, inode);
+                       }
+               }
+       }
+       /*
+        * If this is a partial write that happened to make all buffers
+        * uptodate then we can optimize away a bogus readpage() for
+        * the next read().  Here we 'discover' whether the page went
+        * uptodate as a result of this (potentially partial) write.
+        */
+       if (!partial)
+               SetPageUptodate(page);
+       return 0;
+}
+
+/*
  * Generic "read page" function for block devices that have the normal
  * get_block functionality. This is most of the block device filesystems.
  * Reads the page asynchronously --- the unlock_buffer() and
@@ -1947,13 +2163,23 @@
        if (!err) {
                memset(page_address(page) + offset, 0, PAGE_CACHE_SIZE - offset);
                flush_dcache_page(page);
-               __block_commit_write(inode,page,0,offset);
+
+               /* This will just set the dirty bits for block_write_full_page.
+               ** It is only safe because we have the page locked and
+               ** nobody will try to write the buffers between the
+               ** __block_dirty_range and __block_write_full_page calls.
+               ** We have to clear the page uptodate bit so the buffers
+               ** past the end of the file won't get written.
+               */
+               __block_dirty_range(inode,page,0,offset);
+               ClearPageUptodate(page);
+               err = __block_write_full_page(inode, page, get_block) ;
 done:
                kunmap(page);
-               UnlockPage(page);
                return err;
        }
        ClearPageUptodate(page);
+       UnlockPage(page);
        goto done;
 }
 
@@ -2244,7 +2470,7 @@
        struct buffer_head *bh, *tmp;
        struct buffer_head * insert_point;
        int isize;
-
+       unsigned long index ;
        if ((size & 511) || (size > PAGE_SIZE)) {
                printk("VFS: grow_buffers: size = %d\n",size);
                return 0;
@@ -2260,6 +2486,16 @@
 
        isize = BUFSIZE_INDEX(size);
 
+       /* don't put this buffer head on the free list until the
+       ** page is setup.  Is there a better index to use?  Would 0
+       ** be good enough?
+       */
+       page->flags &= ~(1 << PG_referenced);
+       index = atomic_read(&buffermem_pages) ;
+       atomic_inc(&buffermem_pages);
+       add_to_page_cache_locked(page, &anon_space_mapping, index) ;
+       page->buffers = bh;
+
        spin_lock(&free_list[isize].lock);
        insert_point = free_list[isize].list;
        tmp = bh;
@@ -2283,11 +2519,7 @@
        free_list[isize].list = bh;
        spin_unlock(&free_list[isize].lock);
 
-       page->buffers = bh;
-       page->flags &= ~(1 << PG_referenced);
-       lru_cache_add(page);
        UnlockPage(page);
-       atomic_inc(&buffermem_pages);
        return 1;
 
 no_buffer_head:
@@ -2309,7 +2541,6 @@
  *
  * Wait:
  *     0 - no wait (this does not get called - see try_to_free_buffers below)
- *     1 - start IO for dirty buffers
  *     2 - wait for completion of locked buffers
  */
 static void sync_page_buffers(struct buffer_head *bh, int wait)
@@ -2319,11 +2550,9 @@
        do {
                struct buffer_head *p = tmp;
                tmp = tmp->b_this_page;
-               if (buffer_locked(p)) {
-                       if (wait > 1)
-                               __wait_on_buffer(p);
-               } else if (buffer_dirty(p))
-                       ll_rw_block(WRITE, 1, &p);
+               if (buffer_locked(p) && wait > 1) {
+                       __wait_on_buffer(p);
+               } 
        } while (tmp != bh);
 }
 
@@ -2386,6 +2615,9 @@
 
        /* And free the page */
        page->buffers = NULL;
+       if (page->mapping == &anon_space_mapping) {
+               atomic_dec(&buffermem_pages) ;
+       }
        page_cache_release(page);
        spin_unlock(&free_list[index].lock);
        write_unlock(&hash_table_lock);
@@ -2564,6 +2796,7 @@
 static int flush_dirty_buffers(int check_flushtime)
 {
        struct buffer_head * bh, *next;
+       struct page *page ;
        int flushed = 0, i;
 
  restart:
@@ -2580,6 +2813,8 @@
                }
                if (buffer_locked(bh))
                        continue;
+               if (!buffer_uptodate(bh))
+                       continue ;
 
                if (check_flushtime) {
                        /* The dirty lru list is chronologically ordered so
@@ -2592,13 +2827,15 @@
                        if (++flushed > bdf_prm.b_un.ndirty)
                                goto out_unlock;
                }
-
-               /* OK, now we are committed to write it out. */
-               atomic_inc(&bh->b_count);
-               spin_unlock(&lru_list_lock);
-               ll_rw_block(WRITE, 1, &bh);
-               atomic_dec(&bh->b_count);
-
+               page = bh->b_page ;
+               page_cache_get(page) ;
+               if (TryLockPage(page)) {
+                       page_cache_release(page) ;
+                       continue ;
+               }
+               spin_unlock(&lru_list_lock) ;
+               dirty_list_writepage(page, bh) ;
+               page_cache_release(page) ;
                if (current->need_resched)
                        schedule();
                goto restart;
diff -urN linux-test13-pre4/mm/page_alloc.c linux-anon-space/mm/page_alloc.c
--- linux-test13-pre4/mm/page_alloc.c   Tue Nov 28 13:54:31 2000
+++ linux-anon-space/mm/page_alloc.c    Sun Dec 24 19:00:31 2000
@@ -317,11 +317,12 @@
        /*
         * If we are about to get low on free pages and cleaning
         * the inactive_dirty pages would fix the situation,
-        * wake up bdflush.
+        * wake up kswapd here as well, so page_launder can start
+        * sending things to disk.
         */
        else if (free_shortage() && nr_inactive_dirty_pages > free_shortage()
                        && nr_inactive_dirty_pages >= freepages.high)
-               wakeup_bdflush(0);
+               wakeup_kswapd(0);
 
 try_again:
        /*
diff -urN linux-test13-pre4/mm/vmscan.c linux-anon-space/mm/vmscan.c
--- linux-test13-pre4/mm/vmscan.c       Sat Dec 23 13:14:26 2000
+++ linux-anon-space/mm/vmscan.c        Tue Dec 26 00:52:32 2000
@@ -678,7 +678,6 @@
 
                        /* The page was only in the buffer cache. */
                        } else if (!page->mapping) {
-                               atomic_dec(&buffermem_pages);
                                freed_page = 1;
                                cleaned_pages++;
 
@@ -701,10 +700,9 @@
                        page_cache_release(page);
 
                        /* 
-                        * If we're freeing buffer cache pages, stop when
-                        * we've got enough free memory.
+                        * stop when we've got enough free memory.
                         */
-                       if (freed_page && !free_shortage())
+                       if (!free_shortage())
                                break;
                        continue;
                } else if (page->mapping && !PageDirty(page)) {
@@ -739,9 +737,6 @@
         * free anything yet, we wait synchronously on the writeout of
         * MAX_SYNC_LAUNDER pages.
         *
-        * We also wake up bdflush, since bdflush should, under most
-        * loads, flush out the dirty pages before we have to wait on
-        * IO.
         */
        if (can_get_io_locks && !launder_loop && free_shortage()) {
                launder_loop = 1;
@@ -750,8 +745,6 @@
                        sync = 0;
                /* We only do a few "out of order" flushes. */
                maxlaunder = MAX_LAUNDER;
-               /* Kflushd takes care of the rest. */
-               wakeup_bdflush(0);
                goto dirty_page_rescan;
        }
 
