From: Willem de Bruijn <will...@google.com>

Add MSG_ZEROCOPY support to inet/dgram. This includes udplite.

Tested:
  loopback test snd_zerocopy_lo -u -z produces

  without zerocopy (-u):
    rx=173940 (10854 MB) tx=173940 txc=0
    rx=367026 (22904 MB) tx=367026 txc=0
    rx=564078 (35201 MB) tx=564078 txc=0
    rx=756588 (47214 MB) tx=756588 txc=0

  with zerocopy (-u -z):
    rx=377994 (23588 MB) tx=377994 txc=377980
    rx=792654 (49465 MB) tx=792654 txc=792632
    rx=1209582 (75483 MB) tx=1209582 txc=1209552
    rx=1628376 (101618 MB) tx=1628376 txc=1628338

  The loopback test currently fails with corking, because
  CHECKSUM_PARTIAL has been disabled with UDP_CORK since commit
  d749c9cbffd6 ("ipv4: no CHECKSUM_PARTIAL on MSG_MORE corked sockets").

  I will suggest allowing it on NETIF_F_LOOPBACK devices.

Signed-off-by: Willem de Bruijn <will...@google.com>
---
 include/linux/skbuff.h |  5 +++++
 net/ipv4/ip_output.c   | 34 +++++++++++++++++++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6ad1724ceb60..9e7386f3f7a8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -424,6 +424,11 @@ struct ubuf_info {
 
 #define skb_uarg(SKB)  ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
 
+#define sock_can_zerocopy(sk, rt, csummode) \
+       ((rt->dst.dev->features & NETIF_F_SG) && \
+        ((sk->sk_type == SOCK_RAW) || \
+         (sk->sk_type == SOCK_DGRAM && csummode & CHECKSUM_UNNECESSARY)))
+
 struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
 struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
                                        struct ubuf_info *uarg);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 737ce826d7ec..9e0110d8a429 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -919,7 +919,7 @@ static int __ip_append_data(struct sock *sk,
 {
        struct inet_sock *inet = inet_sk(sk);
        struct sk_buff *skb;
-
+       struct ubuf_info *uarg = NULL;
        struct ip_options *opt = cork->opt;
        int hh_len;
        int exthdrlen;
@@ -963,9 +963,16 @@ static int __ip_append_data(struct sock *sk,
            !exthdrlen)
                csummode = CHECKSUM_PARTIAL;
 
+       if (flags & MSG_ZEROCOPY && length &&
+           sock_can_zerocopy(sk, rt, skb ? skb->ip_summed : csummode)) {
+               uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+               if (!uarg)
+                       return -ENOBUFS;
+       }
+
        cork->length += length;
        if ((((length + fragheaderlen) > mtu) || (skb && skb_is_gso(skb))) &&
-           (sk->sk_protocol == IPPROTO_UDP) &&
+           (sk->sk_protocol == IPPROTO_UDP) && !uarg &&
            (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
            (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
                err = ip_ufo_append_data(sk, queue, getfrag, from, length,
@@ -1017,6 +1024,8 @@ static int __ip_append_data(struct sock *sk,
                        if ((flags & MSG_MORE) &&
                            !(rt->dst.dev->features&NETIF_F_SG))
                                alloclen = mtu;
+                       else if (uarg)
+                               alloclen = min_t(int, fraglen, MAX_HEADER);
                        else
                                alloclen = fraglen;
 
@@ -1059,11 +1068,12 @@ static int __ip_append_data(struct sock *sk,
                        cork->tx_flags = 0;
                        skb_shinfo(skb)->tskey = tskey;
                        tskey = 0;
+                       skb_zcopy_set(skb, uarg);
 
                        /*
                         *      Find where to start putting bytes.
                         */
-                       data = skb_put(skb, fraglen + exthdrlen);
+                       data = skb_put(skb, alloclen);
                        skb_set_network_header(skb, exthdrlen);
                        skb->transport_header = (skb->network_header +
                                                 fragheaderlen);
@@ -1079,7 +1089,9 @@ static int __ip_append_data(struct sock *sk,
                                pskb_trim_unique(skb_prev, maxfraglen);
                        }
 
-                       copy = datalen - transhdrlen - fraggap;
+                       copy = min(datalen,
+                                  alloclen - exthdrlen - fragheaderlen);
+                       copy -= transhdrlen - fraggap;
                        if (copy > 0 && getfrag(from, data + transhdrlen, 
offset, copy, fraggap, skb) < 0) {
                                err = -EFAULT;
                                kfree_skb(skb);
@@ -1087,7 +1099,7 @@ static int __ip_append_data(struct sock *sk,
                        }
 
                        offset += copy;
-                       length -= datalen - fraggap;
+                       length -= copy + transhdrlen;
                        transhdrlen = 0;
                        exthdrlen = 0;
                        csummode = CHECKSUM_NONE;
@@ -1115,6 +1127,17 @@ static int __ip_append_data(struct sock *sk,
                                err = -EFAULT;
                                goto error;
                        }
+               } else if (uarg) {
+                       struct iov_iter *iter;
+
+                       if (sk->sk_type == SOCK_RAW)
+                               iter = &((struct msghdr **)from)[0]->msg_iter;
+                       else
+                               iter = &((struct msghdr *)from)->msg_iter;
+                       err = skb_zerocopy_add_frags_iter(sk, skb, iter, copy, 
uarg);
+                       if (err < 0)
+                               goto error;
+                       copy = err;
                } else {
                        int i = skb_shinfo(skb)->nr_frags;
 
@@ -1155,6 +1178,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
        err = -EFAULT;
 error:
+       sock_zerocopy_put_abort(uarg);
        cork->length -= length;
        IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
        return err;
-- 
2.11.0.483.g087da7b7c-goog

Reply via email to