Support unbuffered and direct I/O writes to an encrypted file.  This may
require performing an RMW (read-modify-write) cycle if the write is not
appropriately aligned with respect to the crypto blocks.
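
For illustration, the alignment handling amounts to the gap calculation
sketched below (the helper name rmw_gaps() is hypothetical; the locals
mirror gap_before/gap_after in the patch):

	/* Illustration only: for a write of [start, end) against a minimum
	 * (crypto) block shift, work out how much existing data must be read
	 * in before and after the write to make up whole blocks for RMW.
	 */
	static void rmw_gaps(unsigned long long start, unsigned long long end,
			     unsigned int min_bshift,
			     unsigned long long *gap_before,
			     unsigned long long *gap_after)
	{
		unsigned long long min_bsize = 1ULL << min_bshift;
		unsigned long long bmask = min_bsize - 1;

		*gap_before = start & bmask;
		*gap_after = (min_bsize - end) & bmask;
		/* The bounce buffer then spans
		 * [start - gap_before, end + gap_after).
		 */
	}

E.g. start = 0x1234, end = 0x5678 and min_bshift = 9 (512-byte blocks)
gives gap_before = 0x34 and gap_after = 0x188.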

Signed-off-by: David Howells <dhowe...@redhat.com>
cc: Jeff Layton <jlay...@kernel.org>
cc: linux-cachefs@redhat.com
cc: linux-fsde...@vger.kernel.org
cc: linux...@kvack.org
---
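
[Reviewer note, not intended for the commit message: the bounce-buffer
decision in netfs_unbuffered_write_iter_locked() below reduces to the
following standalone restatement.  choose_buffering() and its parameters
are hypothetical names; the enum values mirror the patch's locals, and the
patch additionally rejects IOCB_DIRECT writes that would need gap filling.]

	#include <stdbool.h>

	enum buffering_mode { DIRECT_IO, COPY_TO_BOUNCE, ENC_TO_BOUNCE, COPY_THEN_ENC };

	/* Encryption and/or a write that doesn't cover whole crypto blocks
	 * forces a bounce buffer; an aligned write to an encrypted file can be
	 * encrypted straight into the bounce buffer, whereas an unaligned one
	 * is copied in after the RMW read and then encrypted in place.
	 */
	static enum buffering_mode choose_buffering(bool encrypted, bool has_gaps,
						    bool src_crypto_aligned)
	{
		if (!encrypted && !has_gaps)
			return DIRECT_IO;
		if (!encrypted)
			return COPY_TO_BOUNCE;
		if (!has_gaps && src_crypto_aligned)
			return ENC_TO_BOUNCE;
		return COPY_THEN_ENC;
	}
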
 fs/netfs/direct_read.c       |   2 +-
 fs/netfs/direct_write.c      | 210 ++++++++++++++++++++++++++++++++++-
 fs/netfs/internal.h          |   8 ++
 fs/netfs/io.c                | 117 +++++++++++++++++++
 fs/netfs/main.c              |   1 +
 include/linux/netfs.h        |   4 +
 include/trace/events/netfs.h |   1 +
 7 files changed, 337 insertions(+), 6 deletions(-)

diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
index 158719b56900..c01cbe42db8a 100644
--- a/fs/netfs/direct_read.c
+++ b/fs/netfs/direct_read.c
@@ -88,7 +88,7 @@ static int netfs_copy_xarray_to_iter(struct netfs_io_request *rreq,
  * If we did a direct read to a bounce buffer (say we needed to decrypt it),
  * copy the data obtained to the destination iterator.
  */
-static int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq)
+int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq)
 {
        struct iov_iter *dest_iter = &rreq->iter;
        struct kiocb *iocb = rreq->iocb;
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
index b1a4921ac4a2..f9dea801d6dd 100644
--- a/fs/netfs/direct_write.c
+++ b/fs/netfs/direct_write.c
@@ -23,6 +23,100 @@ static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
        }
 }
 
+/*
+ * Allocate a bunch of pages and add them into the xarray buffer starting at
+ * the given index.
+ */
+static int netfs_alloc_buffer(struct xarray *xa, pgoff_t index, unsigned int nr_pages)
+{
+       struct page *page;
+       unsigned int n;
+       int ret = 0;
+       LIST_HEAD(list);
+
+       n = alloc_pages_bulk_list(GFP_NOIO, nr_pages, &list);
+       if (n < nr_pages)
+               ret = -ENOMEM;
+
+       /* Stop inserting on error so that leftover pages are freed below. */
+       while (ret == 0 &&
+              (page = list_first_entry_or_null(&list, struct page, lru))) {
+               list_del(&page->lru);
+               page->index = index;
+               ret = xa_insert(xa, index++, page, GFP_NOIO);
+       }
+
+       while ((page = list_first_entry_or_null(&list, struct page, lru))) {
+               list_del(&page->lru);
+               __free_page(page);
+       }
+       return ret;
+}
+
+/*
+ * Copy all of the data from the source iterator into folios in the destination
+ * xarray.  We cannot step through and kmap the source iterator if it's an
+ * iovec, so we have to step through the xarray and drop the RCU lock each
+ * time.
+ */
+static int netfs_copy_iter_to_xarray(struct iov_iter *src, struct xarray *xa,
+                                    unsigned long long start)
+{
+       struct folio *folio;
+       void *base;
+       pgoff_t index = start / PAGE_SIZE;
+       size_t len, copied, count = iov_iter_count(src);
+
+       XA_STATE(xas, xa, index);
+
+       _enter("%zx", count);
+
+       if (!count)
+               return -EIO;
+
+       len = PAGE_SIZE - offset_in_page(start);
+       rcu_read_lock();
+       xas_for_each(&xas, folio, ULONG_MAX) {
+               size_t offset;
+
+               if (xas_retry(&xas, folio))
+                       continue;
+
+               /* There shouldn't be a need to call xas_pause() as no one else
+                * can see the xarray we're iterating over.
+                */
+               rcu_read_unlock();
+
+               offset = offset_in_folio(folio, start);
+               _debug("folio %lx +%zx [%llx]", folio->index, offset, start);
+
+               while (offset < folio_size(folio)) {
+                       len = min(count, len);
+
+                       base = kmap_local_folio(folio, offset);
+                       copied = copy_from_iter(base, len, src);
+                       kunmap_local(base);
+                       if (copied != len)
+                               goto out;
+                       count -= len;
+                       if (count == 0)
+                               goto out;
+
+                       start += len;
+                       offset += len;
+                       len = PAGE_SIZE;
+               }
+
+               rcu_read_lock();
+       }
+
+       rcu_read_unlock();
+out:
+       _leave(" = %zx", count);
+       return count ? -EIO : 0;
+}
+
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file.  This can also be used for direct I/O writes.
@@ -31,20 +125,47 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
                                           struct netfs_group *netfs_group)
 {
        struct netfs_io_request *wreq;
+       struct netfs_inode *ctx = netfs_inode(file_inode(iocb->ki_filp));
+       unsigned long long real_size = ctx->remote_i_size;
        unsigned long long start = iocb->ki_pos;
        unsigned long long end = start + iov_iter_count(iter);
        ssize_t ret, n;
-       bool async = !is_sync_kiocb(iocb);
+       size_t min_bsize = 1UL << ctx->min_bshift;
+       size_t bmask = min_bsize - 1;
+       size_t gap_before = start & bmask;
+       size_t gap_after = (min_bsize - end) & bmask;
+       bool use_bounce, async = !is_sync_kiocb(iocb);
+       enum {
+               DIRECT_IO, COPY_TO_BOUNCE, ENC_TO_BOUNCE, COPY_THEN_ENC,
+       } buffering;
 
        _enter("");
 
+       /* The real size must be rounded out to the crypto block size plus
+        * any trailer we might want to attach.
+        */
+       if (real_size && ctx->crypto_bshift) {
+               size_t cmask = (1UL << ctx->crypto_bshift) - 1;
+
+               if (real_size < ctx->crypto_trailer)
+                       return -EIO;
+               if ((real_size - ctx->crypto_trailer) & cmask)
+                       return -EIO;
+               real_size -= ctx->crypto_trailer;
+       }
+
        /* We're going to need a bounce buffer if what we transmit is going to
         * be different in some way to the source buffer, e.g. because it gets
         * encrypted/compressed or because it needs expanding to a block size.
         */
-       // TODO
+       use_bounce = test_bit(NETFS_ICTX_ENCRYPTED, &ctx->flags);
+       if (gap_before || gap_after) {
+               if (iocb->ki_flags & IOCB_DIRECT)
+                       return -EINVAL;
+               use_bounce = true;
+       }
 
-       _debug("uw %llx-%llx", start, end);
+       _debug("uw %llx-%llx +%zx,%zx", start, end, gap_before, gap_after);
 
        wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
                                   start, end - start,
@@ -53,7 +174,57 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
        if (IS_ERR(wreq))
                return PTR_ERR(wreq);
 
-       {
+       if (use_bounce) {
+               unsigned long long bstart = start - gap_before;
+               unsigned long long bend = end + gap_after;
+               pgoff_t first = bstart / PAGE_SIZE;
+               pgoff_t last  = (bend - 1) / PAGE_SIZE;
+
+               _debug("bounce %llx-%llx %lx-%lx", bstart, bend, first, last);
+
+               ret = netfs_alloc_buffer(&wreq->bounce, first, last - first + 1);
+               if (ret < 0)
+                       goto out;
+
+               iov_iter_xarray(&wreq->io_iter, READ, &wreq->bounce,
+                               bstart, bend - bstart);
+
+               if (gap_before || gap_after)
+                       async = false; /* We may have to repeat the RMW cycle */
+       }
+
+repeat_rmw_cycle:
+       if (use_bounce) {
+               /* If we're going to need to do an RMW cycle, fill in the gaps
+                * at the ends of the buffer.
+                */
+               if (gap_before || gap_after) {
+                       struct iov_iter buffer = wreq->io_iter;
+
+                       if ((gap_before && start - gap_before < real_size) ||
+                           (gap_after && end < real_size)) {
+                               ret = netfs_rmw_read(wreq, iocb->ki_filp,
+                                                    start - gap_before, gap_before,
+                                                    end, end < real_size ? gap_after : 0);
+                               if (ret < 0)
+                                       goto out;
+                       }
+
+                       if (gap_before && start - gap_before >= real_size)
+                               iov_iter_zero(gap_before, &buffer);
+                       if (gap_after && end >= real_size) {
+                               iov_iter_advance(&buffer, end - start);
+                               iov_iter_zero(gap_after, &buffer);
+                       }
+               }
+
+               if (!test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &wreq->flags))
+                       buffering = COPY_TO_BOUNCE;
+               else if (!gap_before && !gap_after && netfs_is_crypto_aligned(wreq, iter))
+                       buffering = ENC_TO_BOUNCE;
+               else
+                       buffering = COPY_THEN_ENC;
+       } else {
                /* If this is an async op and we're not using a bounce buffer,
                 * we have to save the source buffer as the iterator is only
                 * good until we return.  In such a case, extract an iterator
@@ -77,10 +248,25 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
                }
 
                wreq->io_iter = wreq->iter;
+               buffering = DIRECT_IO;
        }
 
        /* Copy the data into the bounce buffer and encrypt it. */
-       // TODO
+       if (buffering == COPY_TO_BOUNCE ||
+           buffering == COPY_THEN_ENC) {
+               ret = netfs_copy_iter_to_xarray(iter, &wreq->bounce, wreq->start);
+               if (ret < 0)
+                       goto out;
+               wreq->iter = wreq->io_iter;
+               wreq->start -= gap_before;
+               wreq->len += gap_before + gap_after;
+       }
+
+       if (buffering == COPY_THEN_ENC ||
+           buffering == ENC_TO_BOUNCE) {
+               if (!netfs_encrypt(wreq))
+                       goto out;
+       }
 
        /* Dispatch the write. */
        __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
@@ -101,6 +287,20 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
                wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
                            TASK_UNINTERRUPTIBLE);
 
+               /* See if the write failed due to a 3rd party race when doing
+                * an RMW on a partially modified block in an encrypted file.
+                */
+               if (test_and_clear_bit(NETFS_RREQ_REPEAT_RMW, &wreq->flags)) {
+                       netfs_clear_subrequests(wreq, false);
+                       iov_iter_revert(iter, end - start);
+                       wreq->error = 0;
+                       wreq->start = start;
+                       wreq->len = end - start;
+                       wreq->transferred = 0;
+                       wreq->submitted = 0;
+                       goto repeat_rmw_cycle;
+               }
+
                ret = wreq->error;
                _debug("waited = %zd", ret);
                if (ret == 0) {
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 7dd37d3aff3f..a25adbe7ec72 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -28,6 +28,11 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio,
 bool netfs_encrypt(struct netfs_io_request *wreq);
 void netfs_decrypt(struct netfs_io_request *rreq);
 
+/*
+ * direct_read.c
+ */
+int netfs_dio_copy_bounce_to_dest(struct netfs_io_request *rreq);
+
 /*
  * direct_write.c
  */
@@ -38,6 +43,9 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
  * io.c
  */
 int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
+ssize_t netfs_rmw_read(struct netfs_io_request *wreq, struct file *file,
+                      unsigned long long start1, size_t len1,
+                      unsigned long long start2, size_t len2);
 
 /*
  * main.c
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 9887b22e4cb3..14a9f3312d3b 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -775,3 +775,120 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
 out:
        return ret;
 }
+
+static bool netfs_rmw_read_one(struct netfs_io_request *rreq,
+                              unsigned long long start, size_t len)
+{
+       struct netfs_inode *ctx = netfs_inode(rreq->inode);
+       struct iov_iter io_iter;
+       unsigned long long pstart, end = start + len;
+       pgoff_t first, last;
+       ssize_t ret;
+       size_t min_bsize = 1UL << ctx->min_bshift;
+
+       /* Determine the block we need to load. */
+       end = round_up(end, min_bsize);
+       start = round_down(start, min_bsize);
+
+       /* Determine the folios we need to insert. */
+       pstart = round_down(start, PAGE_SIZE);
+       first = pstart / PAGE_SIZE;
+       last = DIV_ROUND_UP(end, PAGE_SIZE);
+
+       ret = netfs_add_folios_to_buffer(&rreq->bounce, rreq->mapping,
+                                        first, last, GFP_NOFS);
+       if (ret < 0) {
+               rreq->error = ret;
+               return false;
+       }
+
+       rreq->start = start;
+       rreq->len = len;
+       rreq->submitted = 0;
+       iov_iter_xarray(&rreq->io_iter, ITER_DEST, &rreq->bounce, start, len);
+
+       io_iter = rreq->io_iter;
+       do {
+               _debug("submit %llx + %zx >= %llx",
+                      rreq->start, rreq->submitted, rreq->i_size);
+               if (rreq->start + rreq->submitted >= rreq->i_size)
+                       break;
+               if (!netfs_rreq_submit_slice(rreq, &io_iter, &rreq->subreq_counter))
+                       break;
+       } while (rreq->submitted < rreq->len);
+
+       if (rreq->submitted < rreq->len) {
+               netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Begin the process of reading in one or two chunks of data for use by
+ * unbuffered write to perform an RMW cycle.  We don't read directly into the
+ * write buffer as this may get called to redo the read in the case that a
+ * conditional write fails due to conflicting 3rd-party modifications.
+ */
+ssize_t netfs_rmw_read(struct netfs_io_request *wreq, struct file *file,
+                      unsigned long long start1, size_t len1,
+                      unsigned long long start2, size_t len2)
+{
+       struct netfs_io_request *rreq;
+       ssize_t ret;
+
+       _enter("RMW:R=%x %llx-%llx %llx-%llx",
+              wreq->debug_id, start1, start1 + len1 - 1, start2, start2 + len2 - 1);
+
+       rreq = netfs_alloc_request(wreq->mapping, file,
+                                  start1, start2 - start1 + len2, NETFS_RMW_READ);
+       if (IS_ERR(rreq))
+               return PTR_ERR(rreq);
+
+       INIT_WORK(&rreq->work, netfs_rreq_work);
+
+       rreq->iter = wreq->io_iter;
+       __set_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags);
+       __set_bit(NETFS_RREQ_USE_BOUNCE_BUFFER, &rreq->flags);
+
+       /* Chop the reads into slices according to what the netfs wants and
+        * submit each one.
+        */
+       netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
+       atomic_set(&rreq->nr_outstanding, 1);
+       if (len1 && !netfs_rmw_read_one(rreq, start1, len1))
+               goto wait;
+       if (len2)
+               netfs_rmw_read_one(rreq, start2, len2);
+
+wait:
+       /* Keep nr_outstanding incremented so that the ref always belongs to us
+        * and the service code isn't punted off to a random thread pool to
+        * process.
+        */
+       for (;;) {
+               wait_var_event(&rreq->nr_outstanding,
+                              atomic_read(&rreq->nr_outstanding) == 1);
+               netfs_rreq_assess(rreq, false);
+               if (atomic_read(&rreq->nr_outstanding) == 1)
+                       break;
+               cond_resched();
+       }
+
+       trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+       wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+                   TASK_UNINTERRUPTIBLE);
+
+       ret = rreq->error;
+       if (ret == 0 && rreq->submitted < rreq->len) {
+               trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
+               ret = -EIO;
+       }
+
+       if (ret == 0)
+               ret = netfs_dio_copy_bounce_to_dest(rreq);
+
+       netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+       return ret;
+}
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 1cf10f9c4c1f..b335e6a50f9c 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -33,6 +33,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = {
        [NETFS_READPAGE]                = "RP",
        [NETFS_READ_FOR_WRITE]          = "RW",
        [NETFS_WRITEBACK]               = "WB",
+       [NETFS_RMW_READ]                = "RM",
        [NETFS_UNBUFFERED_WRITE]        = "UW",
        [NETFS_DIO_READ]                = "DR",
        [NETFS_DIO_WRITE]               = "DW",
diff --git a/include/linux/netfs.h b/include/linux/netfs.h
index 524e6f5ff3fd..9661ae24120f 100644
--- a/include/linux/netfs.h
+++ b/include/linux/netfs.h
@@ -145,6 +145,7 @@ struct netfs_inode {
 #define NETFS_ICTX_ENCRYPTED   2               /* The file contents are encrypted */
        unsigned char           min_bshift;     /* log2 min block size for bounding box or 0 */
        unsigned char           crypto_bshift;  /* log2 of crypto block size */
+       unsigned char           crypto_trailer; /* Size of crypto trailer */
 };
 
 /*
@@ -233,6 +234,7 @@ enum netfs_io_origin {
        NETFS_READPAGE,                 /* This read is a synchronous read */
        NETFS_READ_FOR_WRITE,           /* This read is to prepare a write */
        NETFS_WRITEBACK,                /* This write was triggered by writepages */
+       NETFS_RMW_READ,                 /* This is an unbuffered read for RMW */
        NETFS_UNBUFFERED_WRITE,         /* This is an unbuffered write */
        NETFS_DIO_READ,                 /* This is a direct I/O read */
        NETFS_DIO_WRITE,                /* This is a direct I/O write */
@@ -290,6 +292,7 @@ struct netfs_io_request {
 #define NETFS_RREQ_UPLOAD_TO_SERVER    10      /* Need to write to the server */
 #define NETFS_RREQ_CONTENT_ENCRYPTION  11      /* Content encryption is in use */
 #define NETFS_RREQ_CRYPT_IN_PLACE      12      /* Enc/dec in place in ->io_iter */
+#define NETFS_RREQ_REPEAT_RMW          13      /* Need to repeat RMW cycle */
        const struct netfs_request_ops *netfs_ops;
        void (*cleanup)(struct netfs_io_request *req);
 };
@@ -478,6 +481,7 @@ static inline void netfs_inode_init(struct netfs_inode *ctx,
        ctx->flags = 0;
        ctx->min_bshift = 0;
        ctx->crypto_bshift = 0;
+       ctx->crypto_trailer = 0;
 #if IS_ENABLED(CONFIG_FSCACHE)
        ctx->cache = NULL;
 #endif
diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h
index 2f35057602fa..825946f510ee 100644
--- a/include/trace/events/netfs.h
+++ b/include/trace/events/netfs.h
@@ -33,6 +33,7 @@
        EM(NETFS_READPAGE,                      "RP")           \
        EM(NETFS_READ_FOR_WRITE,                "RW")           \
        EM(NETFS_WRITEBACK,                     "WB")           \
+       EM(NETFS_RMW_READ,                      "RM")           \
        EM(NETFS_UNBUFFERED_WRITE,              "UW")           \
        EM(NETFS_DIO_READ,                      "DR")           \
        E_(NETFS_DIO_WRITE,                     "DW")
--