For both IPv4 and IPv6, if we can't match errors to a socket, try
tunnels before ignoring them. Look up a socket with the original source
and destination ports as found in the UDP packet inside the ICMP payload:
this will work for tunnels that force the same destination port for both
endpoints, e.g. VxLAN and GENEVE.

For IPv6 redirect messages, call ip6_redirect() directly with the output
interface argument set to the interface we received the packet from (as
it's the very interface we should build the exception on), otherwise the
new nexthop will be rejected. There's no such need for IPv4.

Tunnels can now export an encap_err_lookup() operation that indicates a
match. Pass the packet to the lookup function, and if the tunnel driver
reports a matching association, continue with regular ICMP error handling.

Reviewed-by: Sabrina Dubroca <s...@queasysnail.net>
Signed-off-by: Stefano Brivio <sbri...@redhat.com>
---
 include/linux/udp.h      |  1 +
 include/net/udp_tunnel.h |  3 ++
 net/ipv4/udp.c           | 76 +++++++++++++++++++++++++++++++-----
 net/ipv4/udp_tunnel.c    |  1 +
 net/ipv6/udp.c           | 83 ++++++++++++++++++++++++++++++++++------
 5 files changed, 144 insertions(+), 20 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 320d49d85484..c8410837f044 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -71,6 +71,7 @@ struct udp_sock {
         * For encapsulation sockets.
         */
        int (*encap_rcv)(struct sock *sk, struct sk_buff *skb);
+       int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb);
        void (*encap_destroy)(struct sock *sk);
 
        /* GRO functions for UDP socket */
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index fe680ab6b15a..bf2f84984392 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -64,6 +64,8 @@ static inline int udp_sock_create(struct net *net,
 }
 
 typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb);
+typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk,
+                                            struct sk_buff *skb);
 typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk);
 typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk,
                                                    struct list_head *head,
@@ -76,6 +78,7 @@ struct udp_tunnel_sock_cfg {
        /* Used for setting up udp_sock fields, see udp.h for details */
        __u8  encap_type;
        udp_tunnel_encap_rcv_t encap_rcv;
+       udp_tunnel_encap_err_lookup_t encap_err_lookup;
        udp_tunnel_encap_destroy_t encap_destroy;
        udp_tunnel_gro_receive_t gro_receive;
        udp_tunnel_gro_complete_t gro_complete;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ca3ed931f2a9..1f054a85062d 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -585,6 +585,59 @@ static inline bool __udp_is_mcast_sock(struct net *net, struct sock *sk,
        return true;
 }
 
+DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); /* starts out disabled */
+void udp_encap_enable(void) /* switches udp_encap_needed_key on */
+{
+       static_branch_enable(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force
+ * the same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask
+ * the tunnel implementation to match the error against a valid association.
+ *
+ * The skb header offsets are moved for that call and put back afterwards.
+ *
+ * Return the socket if we have a match, NULL otherwise.
+ */
+static struct sock *__udp4_lib_err_encap(struct net *net,
+                                        const struct iphdr *iph,
+                                        struct udphdr *uh,
+                                        struct udp_table *udptable,
+                                        struct sk_buff *skb)
+{
+       int (*lookup)(struct sock *sk, struct sk_buff *skb);
+       int network_offset, transport_offset;
+       struct udp_sock *up;
+       struct sock *sk;
+
+       sk = __udp4_lib_lookup(net, iph->daddr, uh->source,
+                              iph->saddr, uh->dest, skb->dev->ifindex, 0,
+                              udptable, NULL);
+       if (!sk)
+               return NULL;
+
+       network_offset = skb_network_offset(skb);
+       transport_offset = skb_transport_offset(skb);
+
+       /* Network header needs to point to the outer IPv4 header inside ICMP */
+       skb_reset_network_header(skb);
+       iph = ip_hdr(skb);
+       /* Transport header needs to point to the UDP header */
+       skb_set_transport_header(skb, iph->ihl << 2);
+
+       up = udp_sk(sk);
+       lookup = READ_ONCE(up->encap_err_lookup);
+       if (!lookup || lookup(sk, skb))
+               sk = NULL;
+
+       skb_set_transport_header(skb, transport_offset);
+       skb_set_network_header(skb, network_offset);
+
+       return sk;
+}
+
 /*
  * This routine is called by the ICMP module when it gets some
  * sort of error condition.  If err < 0 then the socket should
@@ -603,6 +656,7 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
        struct udphdr *uh = (struct udphdr *)(skb->data+(iph->ihl<<2));
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
+       bool tunnel = false;
        struct sock *sk;
        int harderr;
        int err;
@@ -612,8 +666,15 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
                               iph->saddr, uh->source, skb->dev->ifindex,
                               inet_sdif(skb), udptable, NULL);
        if (!sk) {
-               __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
-               return; /* No socket for error */
+               /* No socket for error: try tunnels before discarding */
+               if (static_branch_unlikely(&udp_encap_needed_key))
+                       sk = __udp4_lib_err_encap(net, iph, uh, udptable, skb);
+
+               if (!sk) {
+                       __ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
+                       return;
+               }
+               tunnel = true;
        }
 
        err = 0;
@@ -656,6 +717,10 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
         *      RFC1122: OK.  Passes ICMP errors back to application, as per
         *      4.1.3.3.
         */
+       if (tunnel) {
+               /* ...not for tunnels though: we don't have a sending socket */
+               goto out;
+       }
        if (!inet->recverr) {
                if (!harderr || sk->sk_state != TCP_ESTABLISHED)
                        goto out;
@@ -1889,13 +1954,6 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
        return 0;
 }
 
-DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key);
-void udp_encap_enable(void)
-{
-       static_branch_enable(&udp_encap_needed_key);
-}
-EXPORT_SYMBOL(udp_encap_enable);
-
 /* returns:
  *  -1: error
  *   0: success
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index 6539ff15e9a3..d0c412fc56ad 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -68,6 +68,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
 
        udp_sk(sk)->encap_type = cfg->encap_type;
        udp_sk(sk)->encap_rcv = cfg->encap_rcv;
+       udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup;
        udp_sk(sk)->encap_destroy = cfg->encap_destroy;
        udp_sk(sk)->gro_receive = cfg->gro_receive;
        udp_sk(sk)->gro_complete = cfg->gro_complete;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d2d97d07ef27..eebd90111646 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -463,6 +463,55 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
        goto try_again;
 }
 
+DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key); /* starts out disabled */
+void udpv6_encap_enable(void) /* switches udpv6_encap_needed_key on */
+{
+       static_branch_enable(&udpv6_encap_needed_key);
+}
+EXPORT_SYMBOL(udpv6_encap_enable);
+
+/* Try to match ICMP errors to UDP tunnels by looking up a socket without
+ * reversing source and destination port: this will match tunnels that force
+ * the same destination port on both endpoints (e.g. VxLAN, GENEVE). Then ask
+ * the tunnel implementation to match the error against a valid association.
+ * The skb header offsets are moved for that call and put back afterwards.
+ * Return the socket if we have a match, NULL otherwise.
+ */
+static struct sock *__udp6_lib_err_encap(struct net *net,
+                                        const struct ipv6hdr *hdr, int offset,
+                                        struct udphdr *uh,
+                                        struct udp_table *udptable,
+                                        struct sk_buff *skb)
+{
+       int (*lookup)(struct sock *sk, struct sk_buff *skb);
+       int network_offset, transport_offset;
+       struct udp_sock *up;
+       struct sock *sk;
+
+       sk = __udp6_lib_lookup(net, &hdr->daddr, uh->source,
+                              &hdr->saddr, uh->dest,
+                              inet6_iif(skb), 0, udptable, skb);
+       if (!sk)
+               return NULL;
+
+       network_offset = skb_network_offset(skb);
+       transport_offset = skb_transport_offset(skb);
+
+       /* Network header needs to point to the outer IPv6 header inside ICMP */
+       skb_reset_network_header(skb);
+       /* Transport header needs to point to the UDP header */
+       skb_set_transport_header(skb, offset);
+
+       up = udp_sk(sk);
+       lookup = READ_ONCE(up->encap_err_lookup);
+       if (!lookup || lookup(sk, skb))
+               sk = NULL;
+
+       skb_set_transport_header(skb, transport_offset);
+       skb_set_network_header(skb, network_offset);
+       return sk;
+}
+
 void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                    u8 type, u8 code, int offset, __be32 info,
                    struct udp_table *udptable)
@@ -472,6 +521,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        const struct in6_addr *saddr = &hdr->saddr;
        const struct in6_addr *daddr = &hdr->daddr;
        struct udphdr *uh = (struct udphdr *)(skb->data+offset);
+       bool tunnel = false;
        struct sock *sk;
        int harderr;
        int err;
@@ -480,9 +530,18 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
        sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
                               inet6_iif(skb), inet6_sdif(skb), udptable, skb);
        if (!sk) {
-               __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
-                                 ICMP6_MIB_INERRORS);
-               return;
+               /* No socket for error: try tunnels before discarding */
+               if (static_branch_unlikely(&udpv6_encap_needed_key)) {
+                       sk = __udp6_lib_err_encap(net, hdr, offset, uh,
+                                                 udptable, skb);
+               }
+
+               if (!sk) {
+                       __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev),
+                                         ICMP6_MIB_INERRORS);
+                       return;
+               }
+               tunnel = true;
        }
 
        harderr = icmpv6_err_convert(type, code, &err);
@@ -496,10 +555,19 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
                        harderr = 1;
        }
        if (type == NDISC_REDIRECT) {
-               ip6_sk_redirect(skb, sk);
+               if (tunnel) {
+                       ip6_redirect(skb, sock_net(sk), inet6_iif(skb),
+                                    sk->sk_mark, sk->sk_uid);
+               } else {
+                       ip6_sk_redirect(skb, sk);
+               }
                goto out;
        }
 
+       /* Tunnels don't have an application socket: don't pass errors back */
+       if (tunnel)
+               goto out;
+
        if (!np->recverr) {
                if (!harderr || sk->sk_state != TCP_ESTABLISHED)
                        goto out;
@@ -548,13 +616,6 @@ static __inline__ void udpv6_err(struct sk_buff *skb,
        __udp6_lib_err(skb, opt, type, code, offset, info, &udp_table);
 }
 
-DEFINE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
-void udpv6_encap_enable(void)
-{
-       static_branch_enable(&udpv6_encap_needed_key);
-}
-EXPORT_SYMBOL(udpv6_encap_enable);
-
 static int udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 {
        struct udp_sock *up = udp_sk(sk);
-- 
2.19.1

Reply via email to