Re: [Cluster-devel] [PATCH][try6] VFS: new want_holesize and got_holesize buffer_head flags for fiemap

Dave Chinner Sun, 26 Oct 2014 14:42:49 -0700

On Thu, Oct 23, 2014 at 08:34:58AM -0400, Bob Peterson wrote:
> ----- Original Message -----
> > On Wed, Oct 22, 2014 at 08:28:53AM -0400, Bob Peterson wrote:
> > > Yes, I thought about that.
> > > One of my early prototypes had a separate function used by fiemap.
> > > Function __generic_block_fiemap would call get_block() which
> > > returned an indication of a hole as it does today. When it saw
> > > the hole, fiemap called a new function get_hole_size() that was
> > > passed in like get_block. The problem is: it's grossly inefficient,
> > > since the new function get_hole_size() has to redo most of the work
> > > that get_block just did (at least in the case of GFS2). (Which in the
> > > case of a 1PB sparse file is non-trivial, since it involves several
> > > levels of metadata indirection). Combining it with get_block made it
> > > much more efficient.
> > > 
> > > Making a separate get_block_map_fiemap() function just seems like an
> > > exercise in redundancy.
> > 
> > I was thinking of replacing get_blocks entirely.  We're not actually
> > using a buffer_head in fiemap, so the interface seems somewhat awkward.
> > If it used something like the iomap interface proposed by Dave long
> > time ago we'd have a much saner interface that for example XFS could use
> > as well.
> 
> Hi Christoph. Can you send a link to the thread regarding Dave's iomap?
> proposal? I don't recall it offhand, so I don't know what it was or
> why it was never implemented. I assume you mean Dave Chinner. Maybe it's
> time to revisit the concept as a long-term solution.


Old patch I had is below.

This is actually on my radar again because I want to get rid of
buffer heads in XFS for various reasons and this is one the
interfaces I need to make that possible....

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com

multipage-write: introduce iomap infrastructure

From: Dave Chinner <dchin...@redhat.com>

Add infrastructure for multipage writes by defining the mapping interface
that the multipage writes will use and the main multipage write loop.

Signed-off-by: Dave Chinner <dchin...@redhat.com>

diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76041b6..1196877 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -513,6 +513,7 @@ enum positive_aop_returns {
 struct page;
 struct address_space;
 struct writeback_control;
+struct iomap;
 
 struct iov_iter {
        const struct iovec *iov;
@@ -604,6 +605,9 @@ struct address_space_operations {
        int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
                                        unsigned long);
        int (*error_remove_page)(struct address_space *, struct page *);
+
+       int (*iomap)(struct address_space *mapping, loff_t pos,
+                       ssize_t length, struct iomap *iomap, int cmd);
 };
 
 /*
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
new file mode 100644
index 0000000..7708614
--- /dev/null
+++ b/include/linux/iomap.h
@@ -0,0 +1,45 @@
+#ifndef _IOMAP_H
+#define _IOMAP_H
+
+/* ->iomap a_op command types */
+#define IOMAP_READ     0x01    /* read the current mapping starting at the
+                                  given position, trimmed to a maximum length.
+                                  FS's should use this to obtain and lock
+                                  resources within this range */
+#define        IOMAP_RESERVE   0x02    /* reserve space for an allocation that 
spans
+                                  the given iomap */
+#define IOMAP_ALLOCATE 0x03    /* allocate space in a given iomap - must have
+                                  first been reserved */
+#define        IOMAP_UNRESERVE 0x04    /* return unused reserved space for the 
given
+                                  iomap and used space. This will always be
+                                  called after a IOMAP_READ so as to allow the
+                                  FS to release held resources. */
+
+/* types of block ranges for multipage write mappings. */
+#define IOMAP_HOLE     0x01    /* no blocks allocated, need allocation */
+#define IOMAP_DELALLOC 0x02    /* delayed allocation blocks */
+#define IOMAP_MAPPED   0x03    /* blocks allocated @blkno */
+#define IOMAP_UNWRITTEN        0x04    /* blocks allocated @blkno in unwritten 
state */
+
+struct iomap {
+       sector_t        blkno;  /* first sector of mapping */
+       loff_t          offset; /* file offset of mapping, bytes */
+       ssize_t         length; /* length of mapping, bytes */
+       int             type;   /* type of mapping */
+       void            *priv;  /* fs private data associated with map */
+};
+
+static inline bool
+iomap_needs_allocation(struct iomap *iomap)
+{
+       return iomap->type == IOMAP_HOLE;
+}
+
+/* multipage write interfaces use iomaps */
+typedef int (*mpw_actor_t)(struct address_space *mapping, void *src,
+                       loff_t pos, ssize_t len, struct iomap *iomap);
+
+ssize_t multipage_write_segment(struct address_space *mapping, void *src,
+                       loff_t pos, ssize_t length, mpw_actor_t actor);
+
+#endif /* _IOMAP_H */
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d4df44..27e2f7d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/compiler.h>
 #include <linux/fs.h>
+#include <linux/iomap.h>
 #include <linux/uaccess.h>
 #include <linux/aio.h>
 #include <linux/capability.h>
@@ -2221,10 +2222,14 @@ repeat:
 }
 EXPORT_SYMBOL(grab_cache_page_write_begin);
 
-static ssize_t generic_perform_write(struct file *file,
-                               struct iov_iter *i, loff_t pos)
+static ssize_t
+__generic_perform_write(
+       struct file             *file,
+       struct address_space    *mapping,
+       struct iov_iter         *i,
+       loff_t                  pos,
+       void                    *priv)
 {
-       struct address_space *mapping = file->f_mapping;
        const struct address_space_operations *a_ops = mapping->a_ops;
        long status = 0;
        ssize_t written = 0;
@@ -2241,7 +2246,6 @@ static ssize_t generic_perform_write(struct file *file,
                unsigned long offset;   /* Offset into pagecache page */
                unsigned long bytes;    /* Bytes to write to page */
                size_t copied;          /* Bytes copied from user */
-               void *fsdata;
 
                offset = (pos & (PAGE_CACHE_SIZE - 1));
                bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
@@ -2265,7 +2269,7 @@ again:
                }
 
                status = a_ops->write_begin(file, mapping, pos, bytes, flags,
-                                               &page, &fsdata);
+                                               &page, priv);
                if (unlikely(status))
                        break;
 
@@ -2279,7 +2283,7 @@ again:
 
                mark_page_accessed(page);
                status = a_ops->write_end(file, mapping, pos, bytes, copied,
-                                               page, fsdata);
+                                               page, priv);
                if (unlikely(status < 0))
                        break;
                copied = status;
@@ -2310,6 +2314,159 @@ again:
        return written ? written : status;
 }
 
+static int
+multipage_write_actor(
+       struct address_space    *mapping,
+       void                    *src,
+       loff_t                  pos,
+       ssize_t                 length,
+       struct iomap            *iomap)
+{
+       struct iov_iter         *i = src;
+
+       return __generic_perform_write(NULL, mapping, i, pos, iomap);
+}
+
+/*
+ * Execute a multipage write on a segment of the mapping that spans a
+ * contiguous range of pages that have identical block mapping state.
+ * This avoids the need to map pages individually, do individual allocations
+ * for each page and most importantly avoid the need for filesystem specific
+ * locking per page. Instead, all the operations are amortised over the entire
+ * range of pages. It is assumed that the filesystems will lock whatever
+ * resources they require in the IOMAP_READ call, and release them in the
+ * IOMAP_COMPLETE call.
+ */
+ssize_t
+multipage_write_segment(
+       struct address_space    *mapping,
+       void                    *src,
+       loff_t                  pos,
+       ssize_t                 length,
+       mpw_actor_t             actor)
+{
+       const struct address_space_operations *a_ops = mapping->a_ops;
+       long                    status;
+       bool                    need_alloc;
+       struct iomap            iomap = { 0 };
+
+       /*
+        * need to map a range from start position for count bytes. This can
+        * span multiple pages - it is only guaranteed to return a range of a
+        * single type of pages (e.g. all into a hole, all mapped or all
+        * unwritten). Failure at this point has nothing to undo.
+        *
+        * We cap the maximum length we map here to MAX_WRITEBACK_PAGES pages
+        * to keep the chunks of work done where somewhat symmetric with the
+        * work writeback does. This is a completely arbitrary number pulled
+        * out of thin air as a best guess for initial testing.
+        */
+       length = min_t(ssize_t, length, 1024 * PAGE_SIZE);
+       status = a_ops->iomap(mapping, pos, length, &iomap, IOMAP_READ);
+       if (status)
+               goto out;
+
+       /*
+        * If allocation is required for this range, reserve the space now so
+        * that the allocation is guaranteed to succeed later on. Once we copy
+        * the data into the page cache pages, then we cannot fail otherwise we
+        * expose transient stale data. If the reserve fails, we can safely
+        * back out at this point as there is nothing to undo.
+        */
+       need_alloc = iomap_needs_allocation(&iomap);
+       if (need_alloc) {
+               status = a_ops->iomap(mapping, pos, length, &iomap, 
IOMAP_RESERVE);
+               if (status)
+                       goto out;
+       }
+
+       /*
+        * because we have now guaranteed that the space allocation will
+        * succeed, we can do the copy-in page by page without having to worry
+        * about failures exposing transient data. Hence we can mostly reuse
+        * the existing method of:
+        *      - grab and lock page
+        *      - set up page mapping structures (e.g. bufferheads)
+        *      - copy data in
+        *      - update page state and unlock page
+        *
+        * This avoids the need to hold MAX_WRITEBACK_PAGES locked at once
+        * while we execute the copy-in. It does mean, however, that the
+        * filesystem needs to avoid any attempts at writeback of pages in this
+        * iomap until the allocation is completed after the copyin.
+        *
+        * XXX: needs to be done per-filesystem in ->writepage
+        */
+
+       status = actor(mapping, src, pos, length, &iomap);
+       printk("mpws: pos %lld, len %lld, status %lld\n", pos, length, status);
+       if (status == -ERANGE)
+               status = 0;
+       if (status <= 0)
+               goto out_failed_write;
+
+       /* now the data has been copied, allocate the range we've copied */
+       if (need_alloc) {
+               int error;
+               /*
+                * allocate should not fail unless the filesystem has had a
+                * fatal error. Issue a warning here to track this situation.
+                */
+               error = a_ops->iomap(mapping, pos, status, &iomap, 
IOMAP_ALLOCATE);
+               if (error) {
+                       WARN_ON(0);
+                       status = error;
+                       /* XXX: mark all pages not-up-to-date? */
+                       goto out_failed_write;
+               }
+       }
+
+
+out:
+       return status;
+       /*
+        * if we copied less than we reserved, release any unused
+        * reservation we hold so that we don't leak the space. Unreserving
+        * space never fails.
+        */
+out_failed_write:
+       if (need_alloc)
+               a_ops->iomap(mapping, pos, 0, &iomap, IOMAP_UNRESERVE);
+       return status;
+}
+EXPORT_SYMBOL(multipage_write_segment);
+
+/*
+ * Loop writing multiple pages in segments until we have run out
+ * of data to write in the iovec iterator.
+ */
+static ssize_t
+generic_perform_multipage_write(struct file *file,
+                               struct iov_iter *i, loff_t pos)
+{
+       long status = 0;
+       ssize_t written = 0;
+
+       do {
+               status = multipage_write_segment(file->f_mapping, i, pos,
+                               iov_iter_count(i), multipage_write_actor);
+               if (status <= 0)
+                       break;
+               pos += status;
+               written += status;
+       } while (iov_iter_count(i));
+
+       return written ? written : status;
+}
+
+static ssize_t generic_perform_write(struct file *file,
+                               struct iov_iter *i, loff_t pos)
+{
+       void *fs_data;
+
+       return __generic_perform_write(file, file->f_mapping, i, pos, &fs_data);
+}
+
 ssize_t
 generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -2320,13 +2477,17 @@ generic_file_buffered_write(struct kiocb *iocb, const 
struct iovec *iov,
        struct iov_iter i;
 
        iov_iter_init(&i, iov, nr_segs, count, written);
-       status = generic_perform_write(file, &i, pos);
+
+       if (file->f_mapping->a_ops->iomap)
+               status = generic_perform_multipage_write(file, &i, pos);
+       else
+               status = generic_perform_write(file, &i, pos);
 
        if (likely(status >= 0)) {
                written += status;
                *ppos = pos + status;
-       }
-       
+       }
+
        return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);

Re: [Cluster-devel] [PATCH][try6] VFS: new want_holesize and got_holesize buffer_head flags for fiemap

Reply via email to