On Fri, 2026-06-05 at 03:31 -0700, Breno Leitao wrote: > Convert RDS socket's getsockopt implementation to use the new > getsockopt_iter callback with sockopt_t. > > Key changes: > - Replace (char __user *optval, int __user *optlen) with sockopt_t *opt > - Use opt->optlen for buffer length (input) and returned size (output) > - Use copy_to_iter() instead of put_user()/copy_to_user() > > The RDS_INFO_* snapshot path in rds_info_getsockopt() used to pin the > userspace buffer with pin_user_pages_fast() on the raw optval address; > the info producers then memcpy into those pages under a spinlock via > kmap_atomic() and so must not fault. Obtain the same page array and > starting offset from opt->iter_out with iov_iter_extract_pages(), which > pins for write because iter_out is ITER_DEST. > > The page array is preallocated here (sized with iov_iter_npages()) and > passed in, so iov_iter_extract_pages() fills it in place rather than > allocating one for us; RDS therefore keeps ownership of the array on > every return path and frees it itself. The rds_info_iterator / > rds_info_copy machinery and all producer callbacks are unchanged. > > Kernel buffers (ITER_KVEC) are not page-backed in a way the info > producers can use, so the RDS_INFO path returns -EOPNOTSUPP for them; > this matches the previous behaviour, where a kernel-buffer getsockopt > hit the WARN_ONCE() path in do_sock_getsockopt() and returned > -EOPNOTSUPP. The simple RDS_RECVERR and SO_RDS_TRANSPORT options keep > working for kernel buffers via copy_to_iter(). > > Signed-off-by: Breno Leitao <[email protected]>
Looks good, thanks for the quick turn around. Reviewed-by: Allison Henderson <[email protected]> > --- > net/rds/af_rds.c | 36 +++++++++++++++------------ > net/rds/info.c | 76 > ++++++++++++++++++++++++++++++++------------------------ > net/rds/info.h | 3 +-- > 3 files changed, 65 insertions(+), 50 deletions(-) > > diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c > index 6f4f9cf352bd..d5defe9172e3 100644 > --- a/net/rds/af_rds.c > +++ b/net/rds/af_rds.c > @@ -37,6 +37,7 @@ > #include <linux/in.h> > #include <linux/ipv6.h> > #include <linux/poll.h> > +#include <linux/uio.h> > #include <net/sock.h> > > #include "rds.h" > @@ -485,35 +486,36 @@ static int rds_setsockopt(struct socket *sock, int > level, int optname, > } > > static int rds_getsockopt(struct socket *sock, int level, int optname, > - char __user *optval, int __user *optlen) > + sockopt_t *opt) > { > struct rds_sock *rs = rds_sk_to_rs(sock->sk); > int ret = -ENOPROTOOPT, len; > int trans; > + int val; > > if (level != SOL_RDS) > goto out; > > - if (get_user(len, optlen)) { > - ret = -EFAULT; > - goto out; > - } > + len = opt->optlen; > > switch (optname) { > case RDS_INFO_FIRST ... RDS_INFO_LAST: > - ret = rds_info_getsockopt(sock, optname, optval, > - optlen); > + ret = rds_info_getsockopt(sock, optname, opt); > break; > > case RDS_RECVERR: > - if (len < sizeof(int)) > + if (len < sizeof(int)) { > ret = -EINVAL; > - else > - if (put_user(rs->rs_recverr, (int __user *) optval) || > - put_user(sizeof(int), optlen)) > + break; > + } > + val = rs->rs_recverr; > + if (copy_to_iter(&val, sizeof(int), &opt->iter_out) != > + sizeof(int)) { > ret = -EFAULT; > - else > + } else { > + opt->optlen = sizeof(int); > ret = 0; > + } > break; > case SO_RDS_TRANSPORT: > if (len < sizeof(int)) { > @@ -522,11 +524,13 @@ static int rds_getsockopt(struct socket *sock, int > level, int optname, > } > trans = (rs->rs_transport ? rs->rs_transport->t_type : > RDS_TRANS_NONE); /* unbound */ > - if (put_user(trans, (int __user *)optval) || > - put_user(sizeof(int), optlen)) > + if (copy_to_iter(&trans, sizeof(int), &opt->iter_out) != > + sizeof(int)) { > ret = -EFAULT; > - else > + } else { > + opt->optlen = sizeof(int); > ret = 0; > + } > break; > default: > break; > @@ -653,7 +657,7 @@ static const struct proto_ops rds_proto_ops = { > .listen = sock_no_listen, > .shutdown = sock_no_shutdown, > .setsockopt = rds_setsockopt, > - .getsockopt = rds_getsockopt, > + .getsockopt_iter = rds_getsockopt, > .sendmsg = rds_sendmsg, > .recvmsg = rds_recvmsg, > .mmap = sock_no_mmap, > diff --git a/net/rds/info.c b/net/rds/info.c > index 17061f6ff74e..21b32eb16559 100644 > --- a/net/rds/info.c > +++ b/net/rds/info.c > @@ -35,6 +35,7 @@ > #include <linux/slab.h> > #include <linux/proc_fs.h> > #include <linux/export.h> > +#include <linux/uio.h> > > #include "rds.h" > > @@ -144,60 +145,68 @@ void rds_info_copy(struct rds_info_iterator *iter, void > *data, > EXPORT_SYMBOL_GPL(rds_info_copy); > > /* > - * @optval points to the userspace buffer that the information snapshot > - * will be copied into. > - * > - * @optlen on input is the size of the buffer in userspace. @optlen > - * on output is the size of the requested snapshot in bytes. > + * @opt->iter_out describes the buffer that the information snapshot will be > + * copied into, and @opt->optlen is the size of that buffer on input. On > + * output @opt->optlen is set to the size of the requested snapshot in bytes. > * > * This function returns -errno if there is a failure, particularly -ENOSPC > - * if the given userspace buffer was not large enough to fit the snapshot. > - * On success it returns the positive number of bytes of each array element > - * in the snapshot. > + * if the given buffer was not large enough to fit the snapshot. On success > + * it returns the positive number of bytes of each array element in the > + * snapshot. > */ > -int rds_info_getsockopt(struct socket *sock, int optname, char __user > *optval, > - int __user *optlen) > +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt) > { > struct rds_info_iterator iter; > struct rds_info_lengths lens; > unsigned long nr_pages = 0; > - unsigned long start; > rds_info_func func; > struct page **pages = NULL; > + size_t offset0 = 0; > + int npages = 0; > int ret; > int len; > int total; > > - if (get_user(len, optlen)) { > - ret = -EFAULT; > - goto out; > - } > + len = opt->optlen; > > /* check for all kinds of wrapping and the like */ > - start = (unsigned long)optval; > - if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) { > + if (len < 0 || len > INT_MAX - PAGE_SIZE + 1) { > ret = -EINVAL; > goto out; > } > > + /* The info producers write into the pages with kmap_atomic() while > + * holding a spinlock, so they need a genuine page-backed user buffer. > + */ > + if (!user_backed_iter(&opt->iter_out)) { > + ret = -EOPNOTSUPP; > + goto out; > + } > + > /* a 0 len call is just trying to probe its length */ > if (len == 0) > goto call_func; > > - nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK)) > - >> PAGE_SHIFT; > - > - pages = kmalloc_objs(struct page *, nr_pages); > + /* > + * Preallocate the page array and pass it in so that > + * iov_iter_extract_pages() fills it in place rather than allocating > + * one for us. Handing it a non-NULL array keeps ownership of the > + * array with us on every return path, instead of depending on the > + * iterator code to allocate and hand it back. > + */ > + npages = iov_iter_npages(&opt->iter_out, INT_MAX); > + pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL); > if (!pages) { > ret = -ENOMEM; > goto out; > } > - ret = pin_user_pages_fast(start, nr_pages, FOLL_WRITE, pages); > - if (ret != nr_pages) { > - if (ret > 0) > - nr_pages = ret; > - else > - nr_pages = 0; > + > + ret = iov_iter_extract_pages(&opt->iter_out, &pages, len, npages, > + 0, &offset0); > + if (ret < 0) > + goto out; > + nr_pages = DIV_ROUND_UP(offset0 + ret, PAGE_SIZE); > + if (ret != len) { > ret = -EAGAIN; /* XXX ? */ > goto out; > } > @@ -213,7 +222,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, > char __user *optval, > > iter.pages = pages; > iter.addr = NULL; > - iter.offset = start & (PAGE_SIZE - 1); > + iter.offset = offset0; > > func(sock, len, &iter, &lens); > BUG_ON(lens.each == 0); > @@ -230,13 +239,16 @@ int rds_info_getsockopt(struct socket *sock, int > optname, char __user *optval, > ret = lens.each; > } > > - if (put_user(len, optlen)) > - ret = -EFAULT; > + opt->optlen = len; > > out: > - if (pages) > + /* > + * iov_iter_extract_pages() pins only user-backed (ubuf) iters; > + * iov_iter_extract_will_pin() reports whether an unpin is owed here. > + */ > + if (pages && iov_iter_extract_will_pin(&opt->iter_out)) > unpin_user_pages_dirty_lock(pages, nr_pages, true); > - kfree(pages); > + kvfree(pages); > > return ret; > } > diff --git a/net/rds/info.h b/net/rds/info.h > index a069b51c4679..1aab62ab6d00 100644 > --- a/net/rds/info.h > +++ b/net/rds/info.h > @@ -21,8 +21,7 @@ typedef void (*rds_info_func)(struct socket *sock, unsigned > int len, > > void rds_info_register_func(int optname, rds_info_func func); > void rds_info_deregister_func(int optname, rds_info_func func); > -int rds_info_getsockopt(struct socket *sock, int optname, char __user > *optval, > - int __user *optlen); > +int rds_info_getsockopt(struct socket *sock, int optname, sockopt_t *opt); > void rds_info_copy(struct rds_info_iterator *iter, void *data, > unsigned long bytes); > void rds_info_iter_unmap(struct rds_info_iterator *iter); >

