2011/4/2 Jeff Layton <[email protected]>:
> Have cifs_writepages issue asynchronous writes instead of waiting on
> each write call to complete before issuing another. This also allows us
> to return more quickly from writepages in the WB_SYNC_NONE case. It
> can just send out all of the I/Os and not wait around for the replies.
>
> In the WB_SYNC_ALL case, have it wait for all of the writes to complete
> after issuing them and return any errors that turn up from it. If those
> errors turn out to all be retryable (-EAGAIN), then retry the entire
> operation again.
>
> This also changes the page locking semantics a little bit. Instead of
> holding the page lock until the response is received, release it after
> doing the send. This will reduce contention for the page lock and should
> prevent processes that have the file mmap'ed from being blocked
> unnecessarily.
>
> Signed-off-by: Jeff Layton <[email protected]>
> ---
> fs/cifs/file.c | 256
> ++++++++++++++++++++++++++++++--------------------------
> 1 files changed, 136 insertions(+), 120 deletions(-)
>
> diff --git a/fs/cifs/file.c b/fs/cifs/file.c
> index da53246..dbdbe97 100644
> --- a/fs/cifs/file.c
> +++ b/fs/cifs/file.c
> @@ -1087,32 +1087,69 @@ static int cifs_partialpagewrite(struct page *page,
> unsigned from, unsigned to)
> return rc;
> }
>
> +/*
> + * walk a list of in-progress async writes and wait for them to complete.
> + * Check the result code on each one.
> + *
> + * There's a clear hierarchy: 0 < EAGAIN < other errors
> + *
> + * If we get an EAGAIN return on a WB_SYNC_ALL writeout, then we need to
> + * go back and do the whole operation again. If we get other errors then
> + * the mapping already likely has AS_EIO or AS_ENOSPC set, so we can
> + * give up and just return an error.
> + */
> +static int
> +cifs_wait_for_write_completion(struct address_space *mapping,
> + struct list_head *pending)
> +{
> + int tmprc, rc = 0;
> + struct cifs_writedata *wdata;
> +
> + /* wait for writes to complete */
> + for (;;) {
> + /* anything left on list? */
> + spin_lock(&mapping->private_lock);
> + if (list_empty(pending)) {
> + spin_unlock(&mapping->private_lock);
> + break;
> + }
> +
> + /* dequeue an entry */
> + wdata = list_first_entry(pending, struct cifs_writedata,
> + pending);
> + list_del_init(&wdata->pending);
> + spin_unlock(&mapping->private_lock);
> +
> + /* wait for it to complete */
> + tmprc = wait_for_completion_killable(&wdata->completion);
> + if (tmprc)
> + rc = tmprc;
> +
> + if (wdata->result == -EAGAIN)
> + rc = rc ? rc : wdata->result;
> + else if (wdata->result != 0)
> + rc = wdata->result;
> +
> + kref_put(&wdata->refcount, cifs_writedata_release);
> + }
> +
> + return rc;
> +}
> +
> static int cifs_writepages(struct address_space *mapping,
> struct writeback_control *wbc)
> {
> - unsigned int bytes_to_write;
> - unsigned int bytes_written;
> - struct cifs_sb_info *cifs_sb;
> - int done = 0;
> + struct cifs_sb_info *cifs_sb = CIFS_SB(mapping->host->i_sb);
> + bool done = false, scanned = false, range_whole = false;
> pgoff_t end;
> pgoff_t index;
> - int range_whole = 0;
> - struct kvec *iov;
> - int len;
> - int n_iov = 0;
> - pgoff_t next;
> - int nr_pages;
> - __u64 offset = 0;
> + unsigned int pvec_pages;
> struct cifsFileInfo *open_file;
> - struct cifs_tcon *tcon;
> - struct cifsInodeInfo *cifsi = CIFS_I(mapping->host);
> + struct cifs_writedata *wdata;
> struct page *page;
> struct pagevec pvec;
> + struct list_head pending;
> int rc = 0;
> - int scanned = 0;
> - int xid;
> -
> - cifs_sb = CIFS_SB(mapping->host->i_sb);
>
> /*
> * If wsize is smaller that the page cache size, default to writing
> @@ -1121,27 +1158,19 @@ static int cifs_writepages(struct address_space
> *mapping,
> if (cifs_sb->wsize < PAGE_CACHE_SIZE)
> return generic_writepages(mapping, wbc);
>
> - iov = kmalloc(32 * sizeof(struct kvec), GFP_KERNEL);
> - if (iov == NULL)
> - return generic_writepages(mapping, wbc);
> -
> /*
> * if there's no open file, then this is likely to fail too,
> * but it'll at least handle the return. Maybe it should be
> * a BUG() instead?
> */
> open_file = find_writable_file(CIFS_I(mapping->host), false);
> - if (!open_file) {
> - kfree(iov);
> + if (!open_file)
> return generic_writepages(mapping, wbc);
> - }
> -
> - tcon = tlink_tcon(open_file->tlink);
> cifsFileInfo_put(open_file);
>
> - xid = GetXid();
> -
> + INIT_LIST_HEAD(&pending);
> pagevec_init(&pvec, 0);
> +
> if (wbc->range_cyclic) {
> index = mapping->writeback_index; /* Start from prev offset */
> end = -1;
> @@ -1149,23 +1178,26 @@ static int cifs_writepages(struct address_space
> *mapping,
> index = wbc->range_start >> PAGE_CACHE_SHIFT;
> end = wbc->range_end >> PAGE_CACHE_SHIFT;
> if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> - range_whole = 1;
> - scanned = 1;
> + range_whole = true;
> + scanned = true;
> }
> retry:
> while (!done && (index <= end) &&
> - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> + (pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> PAGECACHE_TAG_DIRTY,
> min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1))) {
> - int first;
> unsigned int i;
> + unsigned int nr_pages = 0;
> + pgoff_t next = 0;
>
> - first = -1;
> - next = 0;
> - n_iov = 0;
> - bytes_to_write = 0;
> + wdata = cifs_writedata_alloc();
> + if (!wdata) {
> + pagevec_release(&pvec);
> + rc = -ENOMEM;
> + break;
> + }
>
> - for (i = 0; i < nr_pages; i++) {
> + for (i = 0; i < pvec_pages; i++) {
> page = pvec.pages[i];
> /*
> * At this point we hold neither mapping->tree_lock nor
> @@ -1175,7 +1207,7 @@ retry:
> * mapping
> */
>
> - if (first < 0)
> + if (nr_pages == 0)
> lock_page(page);
> else if (!trylock_page(page))
> break;
> @@ -1186,7 +1218,7 @@ retry:
> }
>
> if (!wbc->range_cyclic && page->index > end) {
> - done = 1;
> + done = true;
> unlock_page(page);
> break;
> }
> @@ -1213,119 +1245,103 @@ retry:
> set_page_writeback(page);
>
> if (page_offset(page) >= mapping->host->i_size) {
> - done = 1;
> + done = true;
> unlock_page(page);
> end_page_writeback(page);
> break;
> }
>
> - /*
> - * BB can we get rid of this? pages are held by pvec
> - */
> page_cache_get(page);
> -
> - len = min(mapping->host->i_size - page_offset(page),
> - (loff_t)PAGE_CACHE_SIZE);
> -
> - /* reserve iov[0] for the smb header */
> - n_iov++;
> - iov[n_iov].iov_base = kmap(page);
> - iov[n_iov].iov_len = len;
> - bytes_to_write += len;
> -
> - if (first < 0) {
> - first = i;
> - offset = page_offset(page);
> - }
> + wdata->pages[i] = page;
> next = page->index + 1;
> - if (bytes_to_write + PAGE_CACHE_SIZE > cifs_sb->wsize)
> + ++nr_pages;
> +
> + /* stop here if we have enough for a full wsize write
> */
> + if ((nr_pages + 1) * PAGE_CACHE_SIZE > cifs_sb->wsize)
> break;
> }
> - if (n_iov) {
> -retry_write:
> - open_file = find_writable_file(CIFS_I(mapping->host),
> - false);
> - if (!open_file) {
> +
> + if (wdata->pages[0] == NULL) {
> + /* Need to re-find the pages we skipped */
> + index = pvec.pages[0]->index + 1;
> + pagevec_release(&pvec);
> + continue;
> + }
> +
> + pagevec_release(&pvec);
> + wdata->offset = page_offset(wdata->pages[0]);
> +
> + do {
> + if (wdata->cfile != NULL)
> + cifsFileInfo_put(wdata->cfile);
> + wdata->cfile =
> find_writable_file(CIFS_I(mapping->host),
> + false);
> + if (!wdata->cfile) {
> cERROR(1, "No writable handles for inode");
> rc = -EBADF;
> - } else {
> - rc = CIFSSMBWrite2(xid, tcon,
> open_file->netfid,
> - bytes_to_write, offset,
> - &bytes_written, iov, n_iov,
> - 0);
> - cifsFileInfo_put(open_file);
> + break;
> }
> -
> - cFYI(1, "Write2 rc=%d, wrote=%u", rc, bytes_written);
> -
> - /*
> - * For now, treat a short write as if nothing got
> - * written. A zero length write however indicates
> - * ENOSPC or EFBIG. We have no way to know which
> - * though, so call it ENOSPC for now. EFBIG would
> - * get translated to AS_EIO anyway.
> - *
> - * FIXME: make it take into account the data that did
> - * get written
> - */
> - if (rc == 0) {
> - if (bytes_written == 0)
> - rc = -ENOSPC;
> - else if (bytes_written < bytes_to_write)
> - rc = -EAGAIN;
> + if (wbc->sync_mode == WB_SYNC_ALL) {
> + spin_lock(&mapping->private_lock);
> + list_move(&wdata->pending, &pending);
> + spin_unlock(&mapping->private_lock);
> }
> + rc = cifs_async_writev(wdata);
> + } while (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN);
>
> - /* retry on data-integrity flush */
> - if (wbc->sync_mode == WB_SYNC_ALL && rc == -EAGAIN)
> - goto retry_write;
> -
> - /* fix the stats and EOF */
> - if (bytes_written > 0) {
> - cifs_stats_bytes_written(tcon, bytes_written);
> - cifs_update_eof(cifsi, offset, bytes_written);
> - }
> + for (i = 0; i < nr_pages; ++i)
> + unlock_page(wdata->pages[i]);
>
> - for (i = 0; i < n_iov; i++) {
> - page = pvec.pages[first + i];
> - /* on retryable write error, redirty page */
> + /* send failure -- clean up the mess */
> + if (rc != 0) {
> + for (i = 0; i < nr_pages; ++i) {
> if (rc == -EAGAIN)
> - redirty_page_for_writepage(wbc, page);
> - else if (rc != 0)
> - SetPageError(page);
> - kunmap(page);
> - unlock_page(page);
> - end_page_writeback(page);
> - page_cache_release(page);
> + redirty_page_for_writepage(wbc,
> + wdata->pages[i]);
> + else
> + SetPageError(wdata->pages[i]);
> + end_page_writeback(wdata->pages[i]);
> + page_cache_release(wdata->pages[i]);
> }
> -
> if (rc != -EAGAIN)
> mapping_set_error(mapping, rc);
> - else
> - rc = 0;
> -
> - if ((wbc->nr_to_write -= n_iov) <= 0)
> - done = 1;
> - index = next;
> - } else
> - /* Need to re-find the pages we skipped */
> - index = pvec.pages[0]->index + 1;
> + kref_put(&wdata->refcount, cifs_writedata_release);
> + } else if (wbc->sync_mode == WB_SYNC_NONE) {
> + kref_put(&wdata->refcount, cifs_writedata_release);
> + }
>
> - pagevec_release(&pvec);
> + if ((wbc->nr_to_write -= nr_pages) <= 0)
> + done = true;
> + index = next;
> }
> +
> if (!scanned && !done) {
> /*
> * We hit the last page and there is more work to be done: wrap
> * back to the start of the file
> */
> - scanned = 1;
> + scanned = true;
> index = 0;
> goto retry;
> }
> +
> + /*
> + * for WB_SYNC_ALL, we must wait until the writes complete. If any
> + * return -EAGAIN however, we need to retry the whole thing again.
> + * At that point nr_to_write will be all screwed up, but that
> + * shouldn't really matter for WB_SYNC_ALL (right?)
> + */
> + if (wbc->sync_mode == WB_SYNC_ALL) {
> + rc = cifs_wait_for_write_completion(mapping, &pending);
> + if (rc == -EAGAIN) {
> + index = wbc->range_start >> PAGE_CACHE_SHIFT;
> + goto retry;
> + }
> + }
> +
> if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
> mapping->writeback_index = index;
>
> - FreeXid(xid);
> - kfree(iov);
> return rc;
> }
>
> --
> 1.7.4
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
> the body of a message to [email protected]
> More majordomo info at http://vger.kernel.org/majordomo-info.html
>
Good. I tested it and found that it is ~20% faster when I write a
~1 gigabyte file over the LAN against a Windows 7 server (with
MaxWorkItems set to 4096; otherwise I get
STATUS_INSUFF_SERVER_RESOURCES after a write finishes).
As a test I do the following (python):
f = os.open('file', os.O_RDWR)
str = ''.join('q' for _ in xrange(4096))
for _ in xrange(250000):
os.write(f, str)
os.close(f)
Reviewed-and-Tested-by: Pavel Shilovsky <[email protected]>
--
Best regards,
Pavel Shilovsky.
--
To unsubscribe from this list: send the line "unsubscribe linux-cifs" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html