On Tue, 12 Apr 2011 14:07:04 +0400
Pavel Shilovsky <[email protected]> wrote:

> 2011/4/2 Jeff Layton <[email protected]>:
> > Have cifs_writepages issue asynchronous writes instead of waiting on
> > each write call to complete before issuing another. This also allows us
> > to return more quickly from writepages in the WB_SYNC_NONE case. It
> > can just send out all of the I/Os and not wait around for the replies.
> >
> > In the WB_SYNC_ALL case, have it wait for all of the writes to complete
> > after issuing them and return any errors that turn up from it. If those
> > errors turn out to all be retryable (-EAGAIN), then retry the entire
> > operation again.
> >
> > This also changes the page locking semantics a little bit. Instead of
> > holding the page lock until the response is received, release it after
> > doing the send. This will reduce contention for the page lock and should
> > prevent processes that have the file mmap'ed from being blocked
> > unnecessarily.
> >
> > Signed-off-by: Jeff Layton <[email protected]>
> > ---
> >  fs/cifs/file.c |  256 ++++++++++++++++++++++++++++++--------------------------
> >  1 files changed, 136 insertions(+), 120 deletions(-)
> >
> > diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> > index da53246..dbdbe97 100644
> > --- a/fs/cifs/file.c
> > +++ b/fs/cifs/file.c
> > @@ -1087,32 +1087,69 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to)
> >        return rc;
> >  }
> >
> > +/*
> > + * walk a list of in progress async writes and wait for them to complete.
> > + * Check the result code on each one.
> > + *
> > + * There's a clear hierarchy: 0 < EAGAIN < other errors
> > + *
> > + * If we get an EAGAIN return on a WB_SYNC_ALL writeout, then we need to
> > + * go back and do the whole operation again. If we get other errors then
> > + * the mapping already likely has AS_EIO or AS_ENOSPC set, so we can
> > + * give up and just return an error.
> > + */
> > +static int
> > +cifs_wait_for_write_completion(struct address_space *mapping,
> > +                               struct list_head *pending)
> > +{
> > +       int tmprc, rc = 0;
> > +       struct cifs_writedata *wdata;
> > +
> > +       /* wait for writes to complete */
> > +       for (;;) {
> > +               /* anything left on list? */
> > +               spin_lock(&mapping->private_lock);
> > +               if (list_empty(pending)) {
> > +                       spin_unlock(&mapping->private_lock);
> > +                       break;
> > +               }
> > +
> > +               /* dequeue an entry */
> > +               wdata = list_first_entry(pending, struct cifs_writedata,
> > +                                        pending);
> > +               list_del_init(&wdata->pending);
> > +               spin_unlock(&mapping->private_lock);
> > +
> > +               /* wait for it to complete */
> > +               tmprc = wait_for_completion_killable(&wdata->completion);
> > +               if (tmprc)
> > +                       rc = tmprc;
> > +
> > +               if (wdata->result == -EAGAIN)
> > +                       rc = rc ? rc : wdata->result;
> > +               else if (wdata->result != 0)
> > +                       rc = wdata->result;
> > +
> > +               kref_put(&wdata->refcount, cifs_writedata_release);
> > +       }
> > +
> > +       return rc;
> > +}
> > +
> >  static int cifs_writepages(struct address_space *mapping,
> >                           struct writeback_control *wbc)
> >  {
> > -       unsigned int bytes_to_write;
> > -       unsigned int bytes_written;
> > -       struct cifs_sb_info *cifs_sb;
> > -       int done = 0;
> > +       struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
> > +       bool done = false, scanned = false, range_whole = false;
> >        pgoff_t end;
> >        pgoff_t index;
> > -       int range_whole = 0;
> > -       struct kvec *iov;
> > -       int len;
> > -       int n_iov = 0;
> > -       pgoff_t next;
> > -       int nr_pages;
> > -       __u64 offset = 0;
> > +       unsigned int pvec_pages;
> >        struct cifsFileInfo *open_file;
> > -       struct cifs_tcon *tcon;
> > -       struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
> > +       struct cifs_writedata *wdata;
> >        struct page *page;
> >        struct pagevec pvec;
> > +       struct list_head pending;
> >        int rc = 0;
> > -       int scanned = 0;
> > -       int xid;
> > -
> > -       cifs_sb = CIFS_SB(mapping->host->i_sb);
> >
> >        /*
> >         * If wsize is smaller that the page cache size, default to writing
> > @@ -1121,27 +1158,19 @@ static int cifs_writepages(struct address_space *mapping,
> >        if (cifs_sb->wsize < PAGE_CACHE_SIZE)
> >                return generic_writepages(mapping, wbc);
> >
> > -       iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
> > -       if (iov == NULL)
> > -               return generic_writepages(mapping, wbc);
> > -
> >        /*
> >         * if there's no open file, then this is likely to fail too,
> >         * but it'll at least handle the return. Maybe it should be
> >         * a BUG() instead?
> >         */
> >        open_file = find_writable_file(CIFS_I(mapping->host), false);
> > -       if (!open_file) {
> > -               kfree(iov);
> > +       if (!open_file)
> >                return generic_writepages(mapping, wbc);
> > -       }
> > -
> > -       tcon = tlink_tcon(open_file->tlink);
> >        cifsFileInfo_put(open_file);
> >
> > -       xid = GetXid();
> > -
> > +       INIT_LIST_HEAD(&pending);
> >        pagevec_init(&pvec, 0);
> > +
> >        if (wbc->range_cyclic) {
> >                index = mapping->writeback_index; /* Start from prev offset */
> >                end = -1;
> > @@ -1149,23 +1178,26 @@ static int cifs_writepages(struct address_space *mapping,
> >                index = wbc->range_start >> PAGE_CACHE_SHIFT;
> >                end = wbc->range_end >> PAGE_CACHE_SHIFT;
> >                if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> > -                       range_whole = 1;
> > -               scanned = 1;
> > +                       range_whole = true;
> > +               scanned = true;
> >        }
> >  retry:
> >        while (!done && (index <= end) &&
> > -              (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> > +              (pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> >                        PAGECACHE_TAG_DIRTY,
> >                        min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
> > -               int first;
> >                unsigned int i;
> > +               unsigned int nr_pages = 0;
> > +               pgoff_t next = 0;
> >
> > -               first = -1;
> > -               next = 0;
> > -               n_iov = 0;
> > -               bytes_to_write = 0;
> > +               wdata = cifs_writedata_alloc();
> > +               if (!wdata) {
> > +                       pagevec_release(&pvec);
> > +                       rc = -ENOMEM;
> > +                       break;
> > +               }
> >
> > -               for (i = 0; i < nr_pages; i++) {
> > +               for (i = 0; i < pvec_pages; i++) {
> >                        page = pvec.pages[i];
> >                        /*
> >                         * At this point we hold neither mapping->tree_lock nor
> > @@ -1175,7 +1207,7 @@ retry:
> >                         * mapping
> >                         */
> >
> > -                       if (first < 0)
> > +                       if (nr_pages == 0)
> >                                lock_page(page);
> >                        else if (!trylock_page(page))
> >                                break;
> > @@ -1186,7 +1218,7 @@ retry:
> >                        }
> >
> >                        if (!wbc->range_cyclic && page->index > end) {
> > -                               done = 1;
> > +                               done = true;
> >                                unlock_page(page);
> >                                break;
> >                        }
> > @@ -1213,119 +1245,103 @@ retry:
> >                        set_page_writeback(page);
> >
> >                        if (page_offset(page) >= mapping->host->i_size) {
> > -                               done = 1;
> > +                               done = true;
> >                                unlock_page(page);
> >                                end_page_writeback(page);
> >                                break;
> >                        }
> >
> > -                       /*
> > -                        * BB can we get rid of this?  pages are held by pvec
> > -                        */
> >                        page_cache_get(page);
> > -
> > -                       len = min(mapping->host->i_size - page_offset(page),
> > -                                 (loff_t)PAGE_CACHE_SIZE);
> > -
> > -                       /* reserve iov[0] for the smb header */
> > -                       n_iov++;
> > -                       iov[n_iov].iov_base = kmap(page);
> > -                       iov[n_iov].iov_len = len;
> > -                       bytes_to_write += len;
> > -
> > -                       if (first < 0) {
> > -                               first = i;
> > -                               offset = page_offset(page);
> > -                       }
> > +                       wdata->pages[i] = page;
> >                        next = page->index + 1;
> > -                       if (bytes_to_write + PAGE_CACHE_SIZE > cifs_sb->wsize)
> > +                       ++nr_pages;
> > +
> > +                       /* stop here if we have enough for a full wsize write */
> > +                       if ((nr_pages + 1) * PAGE_CACHE_SIZE > cifs_sb->wsize)
> >                                break;
> >                }
> > -               if (n_iov) {
> > -retry_write:
> > -                       open_file = find_writable_file(CIFS_I(mapping->host),
> > -                                                       false);
> > -                       if (!open_file) {
> > +
> > +               if (wdata->pages[0] == NULL) {
> > +                       /* Need to re-find the pages we skipped */
> > +                       index = pvec.pages[0]->index + 1;
> > +                       pagevec_release(&pvec);
> > +                       continue;
> > +               }
> > +
> > +               pagevec_release(&pvec);
> > +               wdata->offset = page_offset(wdata->pages[0]);
> > +
> > +               do {
> > +                       if (wdata->cfile != NULL)
> > +                               cifsFileInfo_put(wdata->cfile);
> > +                       wdata->cfile = find_writable_file(CIFS_I(mapping->host),
> > +                                                         false);
> > +                       if (!wdata->cfile) {
> >                                cERROR(1, "No writable handles for inode");
> >                                rc = -EBADF;
> > -                       } else {
> > -                               rc = CIFSSMBWrite2(xid, tcon, open_file->netfid,
> > -                                                  bytes_to_write, offset,
> > -                                                  &bytes_written, iov, n_iov,
> > -                                                  0);
> > -                               cifsFileInfo_put(open_file);
> > +                               break;
> >                        }
> > -
> > -                       cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
> > -
> > -                       /*
> > -                        * For now, treat a short write as if nothing got
> > -                        * written. A zero length write however indicates
> > -                        * ENOSPC or EFBIG. We have no way to know which
> > -                        * though, so call it ENOSPC for now. EFBIG would
> > -                        * get translated to AS_EIO anyway.
> > -                        *
> > -                        * FIXME: make it take into account the data that did
> > -                        *        get written
> > -                        */
> > -                       if (rc == 0) {
> > -                               if (bytes_written == 0)
> > -                                       rc = -ENOSPC;
> > -                               else if (bytes_written < bytes_to_write)
> > -                                       rc = -EAGAIN;
> > +                       if (wbc->sync_mode == WB_SYNC_ALL) {
> > +                               spin_lock(&mapping->private_lock);
> > +                               list_move(&wdata->pending, &pending);
> > +                               spin_unlock(&mapping->private_lock);
> >                        }
> > +                       rc = cifs_async_writev(wdata);
> > +               } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
> >
> > -                       /* retry on data-integrity flush */
> > -                       if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
> > -                               goto retry_write;
> > -
> > -                       /* fix the stats and EOF */
> > -                       if (bytes_written > 0) {
> > -                               cifs_stats_bytes_written(tcon, bytes_written);
> > -                               cifs_update_eof(cifsi, offset, bytes_written);
> > -                       }
> > +               for (i = 0; i < nr_pages; ++i)
> > +                       unlock_page(wdata->pages[i]);
> >
> > -                       for (i = 0; i < n_iov; i++) {
> > -                               page = pvec.pages[first + i];
> > -                               /* on retryable write error, redirty page */
> > +               /* send failure -- clean up the mess */
> > +               if (rc != 0) {
> > +                       for (i = 0; i < nr_pages; ++i) {
> >                                if (rc == -EAGAIN)
> > -                                       redirty_page_for_writepage(wbc, page);
> > -                               else if (rc != 0)
> > -                                       SetPageError(page);
> > -                               kunmap(page);
> > -                               unlock_page(page);
> > -                               end_page_writeback(page);
> > -                               page_cache_release(page);
> > +                                       redirty_page_for_writepage(wbc,
> > +                                                          wdata->pages[i]);
> > +                               else
> > +                                       SetPageError(wdata->pages[i]);
> > +                               end_page_writeback(wdata->pages[i]);
> > +                               page_cache_release(wdata->pages[i]);
> >                        }
> > -
> >                        if (rc != -EAGAIN)
> >                                mapping_set_error(mapping, rc);
> > -                       else
> > -                               rc = 0;
> > -
> > -                       if ((wbc->nr_to_write -= n_iov) <= 0)
> > -                               done = 1;
> > -                       index = next;
> > -               } else
> > -                       /* Need to re-find the pages we skipped */
> > -                       index = pvec.pages[0]->index + 1;
> > +                       kref_put(&wdata->refcount, cifs_writedata_release);
> > +               } else if (wbc->sync_mode == WB_SYNC_NONE) {
> > +                       kref_put(&wdata->refcount, cifs_writedata_release);
> > +               }
> >
> > -               pagevec_release(&pvec);
> > +               if ((wbc->nr_to_write -= nr_pages) <= 0)
> > +                       done = true;
> > +               index = next;
> >        }
> > +
> >        if (!scanned && !done) {
> >                /*
> >                 * We hit the last page and there is more work to be done: wrap
> >                 * back to the start of the file
> >                 */
> > -               scanned = 1;
> > +               scanned = true;
> >                index = 0;
> >                goto retry;
> >        }
> > +
> > +       /*
> > +        * for WB_SYNC_ALL, we must wait until the writes complete. If any
> > +        * return -EAGAIN however, we need to retry the whole thing again.
> > +        * At that point nr_to_write will be all screwed up, but that
> > +        * shouldn't really matter for WB_SYNC_ALL (right?)
> > +        */
> > +       if (wbc->sync_mode == WB_SYNC_ALL) {
> > +               rc = cifs_wait_for_write_completion(mapping, &pending);
> > +               if (rc == -EAGAIN) {
> > +                       index = wbc->range_start >> PAGE_CACHE_SHIFT;
> > +                       goto retry;
> > +               }
> > +       }
> > +
> >        if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> >                mapping->writeback_index = index;
> >
> > -       FreeXid(xid);
> > -       kfree(iov);
> >        return rc;
> >  }
> >
> > --
> > 1.7.4
> >
> 
> Good. I tested it and found that it is ~20% faster when I write a
> ~1 gigabyte file over the LAN against Windows 7 (with MaxWorkItems set
> to 4096; otherwise I get STATUS_INSUFF_SERVER_RESOURCES after a write
> finishes).
> 
> As a test I do the following (python):
>   f = os.open('file', os.O_RDWR)
>   str = ''.join('q' for _ in xrange(4096))
>   for _ in xrange(250000):
>       os.write(f, str)
>   os.close(f)
> 
> Reviewed-and-Tested-by: Pavel Shilovsky <[email protected]>
> 

Thanks for testing it! Yes, I think I'll need to make this patch
respect the cifs_max_pending module parameter so we don't end up with
too many outstanding writes at a time. At the same time, we probably
ought to see about raising that above the hardcoded limit of 50, or
come up with a better heuristic for autonegotiating it.
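
To make the throttling idea concrete, here is a rough sketch of what
capping the number of in-flight async writes could look like. This is
purely illustrative -- wait_for_write_slot(), release_write_slot() and
the standalone counter are placeholder names, not the existing
cifs_max_pending request accounting:

/*
 * Illustrative sketch only: cap the number of in-flight async writes at
 * "max_pending" and block new submissions until a slot frees up. The
 * real code would tie into the existing per-server request accounting.
 */
#include <linux/atomic.h>
#include <linux/wait.h>

static atomic_t in_flight = ATOMIC_INIT(0);
static DECLARE_WAIT_QUEUE_HEAD(write_slot_waitq);

/* called before dispatching an async write; 0 on success or -ERESTARTSYS */
static int wait_for_write_slot(unsigned int max_pending)
{
	int rc;

	/* atomically grab a slot; sleep whenever all slots are taken */
	while (!atomic_add_unless(&in_flight, 1, max_pending)) {
		rc = wait_event_killable(write_slot_waitq,
				atomic_read(&in_flight) < max_pending);
		if (rc)
			return rc;
	}
	return 0;
}

/* called from the async write completion path */
static void release_write_slot(void)
{
	atomic_dec(&in_flight);
	wake_up(&write_slot_waitq);
}

With something along those lines, the dispatch loop in cifs_writepages
would take a slot before each cifs_async_writev() call and the write
completion path would release it.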

I should have a new set ready in about a week or so, but I also need to
do a bit of research on how best to negotiate a larger wsize. That
should help performance even more, I think.
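
To make that concrete too, below is a rough sketch of the sort of wsize
heuristic that research would feed into. Again, purely illustrative --
negotiate_wsize(), LARGE_WRITE_DEFAULT, server_max_buf and
large_write_ok are made-up names, and a real version would have to key
off what the server advertises in the NEGOTIATE response and account
for the SMB header overhead:

/*
 * Illustrative sketch only: pick a wsize based on what the server can
 * handle. If large writes are allowed, try a big default; otherwise
 * stay within the server's advertised buffer size.
 */
#include <linux/types.h>
#include <linux/pagemap.h>	/* PAGE_CACHE_SIZE */

#define LARGE_WRITE_DEFAULT	(1024 * 1024)	/* candidate default, not final */

static unsigned int negotiate_wsize(unsigned int server_max_buf,
				    bool large_write_ok,
				    unsigned int user_wsize)
{
	unsigned int wsize;

	if (large_write_ok)
		wsize = LARGE_WRITE_DEFAULT;
	else
		wsize = server_max_buf;

	/* an explicit wsize= mount option still takes precedence */
	if (user_wsize)
		wsize = user_wsize;

	/* never drop below a page; otherwise round down to a page multiple */
	if (wsize < PAGE_CACHE_SIZE)
		return PAGE_CACHE_SIZE;

	return wsize & ~(PAGE_CACHE_SIZE - 1);
}

Whether 1MB is the right default, and how to tell that the server can
really handle writes that large, is exactly the part that needs the
research.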

-- 
Jeff Layton <[email protected]>
--
To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
