Convert RDS socket's getsockopt implementation to use the new
getsockopt_iter callback with sockopt_t.

Key changes:
- Replace (char __user *optval, int __user *optlen) with sockopt_t *opt
- Use opt->optlen for buffer length (input) and returned size (output)
- Use copy_to_iter() instead of put_user()/copy_to_user()

The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the
userspace buffer with pin_user_pages_fast() on the raw optval address;
the info producers then memcpy into those pages under a spinlock via
kmap_atomic() and so must not fault. Obtain the same page array and
starting offset from opt->iter_out with iov_iter_extract_pages(), which
pins for write because iter_out is ITER_DEST.

The page array is preallocated here (sized with iov_iter_npages()) and
passed in, so iov_iter_extract_pages() fills it in place rather than
allocating one for us; RDS therefore keeps ownership of the array on
every return path and frees it itself. The rds_info_iterator /
rds_info_copy machinery and all producer callbacks are unchanged.

Kernel buffers (ITER_KVEC) are not page-backed in a way the info
producers can use, so the RDS_INFO path returns -EOPNOTSUPP for them;
this matches the previous behaviour, where a kernel-buffer getsockopt
hit the WARN_ONCE() path in do_sock_getsockopt() and returned
-EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep
working for kernel buffers via copy_to_iter().

Signed-off-by: Breno Leitao <[email protected]>
---
 net/rds/af_rds.c | 36 ++++++++++++++++-------------
 net/rds/info.c   | 70 +++++++++++++++++++++++++++++++-------------------------
 net/rds/info.h   |  3 +--
 3 files changed, 60 insertions(+), 49 deletions(-)

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 6f4f9cf352bd..d5defe9172e3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -37,6 +37,7 @@
 #include <linux/in.h>
 #include <linux/ipv6.h>
 #include <linux/poll.h>
+#include <linux/uio.h>
 #include <net/sock.h>
 
 #include "rds.h"
@@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int level, 
int optname,
 }
 
 static int rds_getsockopt(struct socket *sock, int level, int optname,
-                         char __user *optval, int __user *optlen)
+                         sockopt_t *opt)
 {
        struct rds_sock *rs = rds_sk_to_rs(sock->sk);
        int ret = -ENOPROTOOPT, len;
        int trans;
+       int val;
 
        if (level != SOL_RDS)
                goto out;
 
-       if (get_user(len, optlen)) {
-               ret = -EFAULT;
-               goto out;
-       }
+       len = opt->optlen;
 
        switch (optname) {
        case RDS_INFO_FIRST ... RDS_INFO_LAST:
-               ret = rds_info_getsockopt(sock, optname, optval,
-                                         optlen);
+               ret = rds_info_getsockopt(sock, optname, opt);
                break;
 
        case RDS_RECVERR:
-               if (len < sizeof(int))
+               if (len < sizeof(int)) {
                        ret = -EINVAL;
-               else
-               if (put_user(rs->rs_recverr, (int __user *) optval) ||
-                   put_user(sizeof(int), optlen))
+                       break;
+               }
+               val = rs->rs_recverr;
+               if (copy_to_iter(&val, sizeof(int), &opt->iter_out) !=
+                   sizeof(int)) {
                        ret = -EFAULT;
-               else
+               } else {
+                       opt->optlen = sizeof(int);
                        ret = 0;
+               }
                break;
        case SO_RDS_TRANSPORT:
                if (len < sizeof(int)) {
@@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int level, 
int optname,
                }
                trans = (rs->rs_transport ? rs->rs_transport->t_type :
                         RDS_TRANS_NONE); /* unbound */
-               if (put_user(trans, (int __user *)optval) ||
-                   put_user(sizeof(int), optlen))
+               if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) !=
+                   sizeof(int)) {
                        ret = -EFAULT;
-               else
+               } else {
+                       opt->optlen = sizeof(int);
                        ret = 0;
+               }
                break;
        default:
                break;
@@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = {
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   rds_setsockopt,
-       .getsockopt =   rds_getsockopt,
+       .getsockopt_iter =      rds_getsockopt,
        .sendmsg =      rds_sendmsg,
        .recvmsg =      rds_recvmsg,
        .mmap =         sock_no_mmap,
diff --git a/net/rds/info.c b/net/rds/info.c
index f1b29994934a..171838782595 100644
--- a/net/rds/info.c
+++ b/net/rds/info.c
@@ -35,6 +35,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/export.h>
+#include <linux/uio.h>
 
 #include "rds.h"
 
@@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void 
*data,
 EXPORT_SYMBOL_GPL(rds_info_copy);
 
 /*
- * @optval points to the userspace buffer that the information snapshot
- * will be copied into.
- *
- * @optlen on input is the size of the buffer in userspace.  @optlen
- * on output is the size of the requested snapshot in bytes.
+ * @opt->iter_out describes the buffer that the information snapshot will be
+ * copied into, and @opt->optlen is the size of that buffer on input.  On
+ * output @opt->optlen is set to the size of the requested snapshot in bytes.
  *
  * This function returns -errno if there is a failure, particularly -ENOSPC
- * if the given userspace buffer was not large enough to fit the snapshot.
- * On success it returns the positive number of bytes of each array element
- * in the snapshot.
+ * if the given buffer was not large enough to fit the snapshot.  On success
+ * it returns the positive number of bytes of each array element in the
+ * snapshot.
  */
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
-                       int __user *optlen)
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt)
 {
        struct rds_info_iterator iter;
        struct rds_info_lengths lens;
        unsigned long nr_pages = 0;
-       unsigned long start;
        rds_info_func func;
        struct page **pages = NULL;
+       size_t offset0 = 0;
+       int npages = 0;
        int ret;
        int len;
        int total;
 
-       if (get_user(len, optlen)) {
-               ret = -EFAULT;
-               goto out;
-       }
+       len = opt->optlen;
 
        /* check for all kinds of wrapping and the like */
-       start = (unsigned long)optval;
-       if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
+       if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) {
                ret = -EINVAL;
                goto out;
        }
 
+       /* The info producers write into the pages with kmap_atomic() while
+        * holding a spinlock, so they need a genuine page-backed user buffer.
+        */
+       if (iov_iter_is_kvec(&opt->iter_out)) {
+               ret = -EOPNOTSUPP;
+               goto out;
+       }
+
        /* a 0 len call is just trying to probe its length */
        if (len == 0)
                goto call_func;
 
-       nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
-                       >> PAGE_SHIFT;
-
-       pages = kmalloc_objs(struct page *, nr_pages);
+       /*
+        * Preallocate the page array and pass it in so that
+        * iov_iter_extract_pages() fills it in place rather than allocating
+        * one for us.  Handing it a non-NULL array keeps ownership of the
+        * array with us on every return path, instead of depending on the
+        * iterator code to allocate and hand it back.
+        */
+       npages = iov_iter_npages(&opt->iter_out, INT_MAX);
+       pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
        if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
-       ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages);
-       if (ret != nr_pages) {
-               if (ret > 0)
-                       nr_pages = ret;
-               else
-                       nr_pages = 0;
+
+       ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages,
+                                    0, &offset0);
+       if (ret < 0)
+               goto out;
+       nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE);
+       if (ret != len) {
                ret = -EAGAIN; /* XXX ? */
                goto out;
        }
@@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, 
char __user *optval,
 
        iter.pages = pages;
        iter.addr = NULL;
-       iter.offset = start & (PAGE_SIZE - 1);
+       iter.offset = offset0;
 
        func(sock, len, &iter, &lens);
        BUG_ON(lens.each == 0);
@@ -230,13 +239,12 @@ int rds_info_getsockopt(struct socket *sock, int optname, 
char __user *optval,
                ret = lens.each;
        }
 
-       if (put_user(len, optlen))
-               ret = -EFAULT;
+       opt->optlen = len;
 
 out:
        if (pages)
                unpin_user_pages(pages, nr_pages);
-       kfree(pages);
+       kvfree(pages);
 
        return ret;
 }
diff --git a/net/rds/info.h b/net/rds/info.h
index a069b51c4679..1aab62ab6d00 100644
--- a/net/rds/info.h
+++ b/net/rds/info.h
@@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned 
int len,
 
 void rds_info_register_func(int optname, rds_info_func func);
 void rds_info_deregister_func(int optname, rds_info_func func);
-int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
-                       int __user *optlen);
+int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt);
 void rds_info_copy(struct rds_info_iterator *iter, void *data,
                   unsigned long bytes);
 void rds_info_iter_unmap(struct rds_info_iterator *iter);

-- 
2.54.0


Reply via email to