Please queue up the following bug fixes for 3.0.x-stable Thanks!
>From a772cf60ab184e3a8028d0de2d30ec450fdca89a Mon Sep 17 00:00:00 2001 From: "David S. Miller" <[email protected]> Date: Mon, 6 Feb 2012 15:14:37 -0500 Subject: [PATCH 01/21] net: Make qdisc_skb_cb upper size bound explicit.
[ Upstream commit 16bda13d90c8d5da243e2cfa1677e62ecce26860 ] Just like skb->cb[], so that qdisc_skb_cb can be encapsulated inside of other data structures. This is intended to be used by IPoIB so that it can remember addressing information stored at hard_header_ops->create() time that it can fetch when the packet gets to the transmit routine. Signed-off-by: David S. Miller <[email protected]> --- include/net/sch_generic.h | 9 ++++++++- net/sched/sch_choke.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_sfb.c | 3 +-- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index b931f02..f1fbe2d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -219,9 +219,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - long data[]; + unsigned char data[24]; }; +static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) +{ + struct qdisc_skb_cb *qcb; + BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + BUILD_BUG_ON(sizeof(qcb->data) < sz); +} + static inline int qdisc_qlen(struct Qdisc *q) { return q->q.qlen; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 06afbae..178ee83 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -225,8 +225,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 69c35f6..87b9658 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -117,8 +117,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 0a833d0..47ee29f 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -93,8 +93,7 @@ struct sfb_skb_cb { static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } -- 1.7.7.6 >From 03dc6dfa53bde2c7f0def460ee52f5f01f6e9d44 Mon Sep 17 00:00:00 2001 From: Roland Dreier <[email protected]> Date: Tue, 7 Feb 2012 14:51:21 +0000 Subject: [PATCH 02/21] IPoIB: Stop lying about hard_header_len and use skb->cb to stash LL addresses [ Upstream commit 936d7de3d736e0737542641269436f4b5968e9ef ] Commit a0417fa3a18a ("net: Make qdisc_skb_cb upper size bound explicit.") made it possible for a netdev driver to use skb->cb between its header_ops.create method and its .ndo_start_xmit method. Use this in ipoib_hard_header() to stash away the LL address (GID + QPN), instead of the "ipoib_pseudoheader" hack. This allows IPoIB to stop lying about its hard_header_len, which will let us fix the L2 check for GRO. Signed-off-by: Roland Dreier <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- drivers/infiniband/ulp/ipoib/ipoib.h | 6 ++- drivers/infiniband/ulp/ipoib/ipoib_main.c | 55 ++++++++--------------- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 9 +--- 3 files changed, 24 insertions(+), 46 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 7b6985a..936804e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -44,6 +44,7 @@ #include <linux/mutex.h> #include <net/neighbour.h> +#include <net/sch_generic.h> #include <asm/atomic.h> @@ -117,8 +118,9 @@ struct ipoib_header { u16 reserved; }; -struct ipoib_pseudoheader { - u8 hwaddr[INFINIBAND_ALEN]; +struct ipoib_cb { + struct qdisc_skb_cb qdisc_cb; + u8 hwaddr[INFINIBAND_ALEN]; }; /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index a98c414..b811444 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -658,7 +658,7 @@ static void ipoib_path_lookup(struct sk_buff *skb, struct net_device *dev) } static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, - struct ipoib_pseudoheader *phdr) + struct ipoib_cb *cb) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_path *path; @@ -666,17 +666,15 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, spin_lock_irqsave(&priv->lock, flags); - path = __path_find(dev, phdr->hwaddr + 4); + path = __path_find(dev, cb->hwaddr + 4); if (!path || !path->valid) { int new_path = 0; if (!path) { - path = path_rec_create(dev, phdr->hwaddr + 4); + path = path_rec_create(dev, cb->hwaddr + 4); new_path = 1; } if (path) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); if (!path->query && path_rec_start(dev, path)) { @@ -700,12 +698,10 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, be16_to_cpu(path->pathrec.dlid)); spin_unlock_irqrestore(&priv->lock, flags); - ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr)); + ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); return; } else if ((path->query || !path_rec_start(dev, path)) && skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof *phdr); __skb_queue_tail(&path->queue, skb); } else { ++dev->stats.tx_dropped; @@ -774,16 +770,14 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) dev_kfree_skb_any(skb); } } else { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb->data; - skb_pull(skb, sizeof *phdr); + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; - if (phdr->hwaddr[4] == 0xff) { + if (cb->hwaddr[4] == 0xff) { /* Add in the P_Key for multicast*/ - phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff; - phdr->hwaddr[9] = priv->pkey & 0xff; + cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; + cb->hwaddr[9] = priv->pkey & 0xff; - ipoib_mcast_send(dev, phdr->hwaddr + 4, skb); + ipoib_mcast_send(dev, cb->hwaddr + 4, skb); } else { /* unicast GID -- should be ARP or RARP reply */ @@ -792,14 +786,14 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ipoib_warn(priv, "Unicast, no %s: type %04x, QPN %06x %pI6\n", skb_dst(skb) ? "neigh" : "dst", be16_to_cpup((__be16 *) skb->data), - IPOIB_QPN(phdr->hwaddr), - phdr->hwaddr + 4); + IPOIB_QPN(cb->hwaddr), + cb->hwaddr + 4); dev_kfree_skb_any(skb); ++dev->stats.tx_dropped; goto unlock; } - unicast_arp_send(skb, dev, phdr); + unicast_arp_send(skb, dev, cb); } } unlock: @@ -825,8 +819,6 @@ static int ipoib_hard_header(struct sk_buff *skb, const void *daddr, const void *saddr, unsigned len) { struct ipoib_header *header; - struct dst_entry *dst; - struct neighbour *n; header = (struct ipoib_header *) skb_push(skb, sizeof *header); @@ -834,18 +826,13 @@ static int ipoib_hard_header(struct sk_buff *skb, header->reserved = 0; /* - * If we don't have a neighbour structure, stuff the - * destination address onto the front of the skb so we can - * figure out where to send the packet later. + * If we don't have a dst_entry structure, stuff the + * destination address into skb->cb so we can figure out where + * to send the packet later. */ - dst = skb_dst(skb); - n = NULL; - if (dst) - n = dst_get_neighbour_raw(dst); - if ((!dst || !n) && daddr) { - struct ipoib_pseudoheader *phdr = - (struct ipoib_pseudoheader *) skb_push(skb, sizeof *phdr); - memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN); + if (!skb_dst(skb)) { + struct ipoib_cb *cb = (struct ipoib_cb *) skb->cb; + memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); } return 0; @@ -1021,11 +1008,7 @@ static void ipoib_setup(struct net_device *dev) dev->flags |= IFF_BROADCAST | IFF_MULTICAST; - /* - * We add in INFINIBAND_ALEN to allow for the destination - * address "pseudoheader" for skbs without neighbour struct. - */ - dev->hard_header_len = IPOIB_ENCAP_LEN + INFINIBAND_ALEN; + dev->hard_header_len = IPOIB_ENCAP_LEN; dev->addr_len = INFINIBAND_ALEN; dev->type = ARPHRD_INFINIBAND; dev->tx_queue_len = ipoib_sendq_size * 2; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index a8d2a89..8b63506 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -258,21 +258,14 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast, netif_tx_lock_bh(dev); while (!skb_queue_empty(&mcast->pkt_queue)) { struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue); - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n = NULL; netif_tx_unlock_bh(dev); skb->dev = dev; - if (dst) - n = dst_get_neighbour_raw(dst); - if (!dst || !n) { - /* put pseudoheader back on for next time */ - skb_push(skb, sizeof (struct ipoib_pseudoheader)); - } if (dev_queue_xmit(skb)) ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n"); + netif_tx_lock_bh(dev); } netif_tx_unlock_bh(dev); -- 1.7.7.6 >From 4b2fedcbf6f9a933d4e2b74f2f20edcffe75fdd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Wed, 8 Feb 2012 08:51:50 +0000 Subject: [PATCH 03/21] gro: more generic L2 header check [ Upstream commit 5ca3b72c5da47d95b83857b768def6172fbc080a ] Shlomo Pongratz reported GRO L2 header check was suited for Ethernet only, and failed on IB/ipoib traffic. He provided a patch faking a zeroed header to let GRO aggregates frames. Roland Dreier, Herbert Xu, and others suggested we change GRO L2 header check to be more generic, ie not assuming L2 header is 14 bytes, but taking into account hard_header_len. __napi_gro_receive() has special handling for the common case (Ethernet) to avoid a memcmp() call and use an inline optimized function instead. Signed-off-by: Eric Dumazet <[email protected]> Reported-by: Shlomo Pongratz <[email protected]> Cc: Roland Dreier <[email protected]> Cc: Or Gerlitz <[email protected]> Cc: Herbert Xu <[email protected]> Tested-by: Sean Hefty <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/core/dev.c | 10 ++++++++-- 1 files changed, 8 insertions(+), 2 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index f14f601..17fdbf8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3434,14 +3434,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- 1.7.7.6 >From 3fa54faac3a1c9c742a7b7a2c0ff21dd17a5b891 Mon Sep 17 00:00:00 2001 From: Thomas Graf <[email protected]> Date: Wed, 15 Feb 2012 04:09:46 +0000 Subject: [PATCH 04/21] veth: Enforce minimum size of VETH_INFO_PEER [ Upstream commit 237114384ab22c174ec4641e809f8e6cbcfce774 ] VETH_INFO_PEER carries struct ifinfomsg plus optional IFLA attributes. A minimal size of sizeof(struct ifinfomsg) must be enforced or we may risk accessing that struct beyond the limits of the netlink message. Signed-off-by: Thomas Graf <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- drivers/net/veth.c | 4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 4bf7c6d..6c0a3b0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -421,7 +421,9 @@ static void veth_dellink(struct net_device *dev, struct list_head *head) unregister_netdevice_queue(peer, head); } -static const struct nla_policy veth_policy[VETH_INFO_MAX + 1]; +static const struct nla_policy veth_policy[VETH_INFO_MAX + 1] = { + [VETH_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, +}; static struct rtnl_link_ops veth_link_ops = { .kind = DRV_NAME, -- 1.7.7.6 >From 883517479ee9727104b4ac4a3f8993355fdb2763 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Tue, 14 Feb 2012 10:27:09 +0000 Subject: [PATCH 05/21] 3c59x: shorten timer period for slave devices [ Upstream commit 3013dc0cceb9baaf25d5624034eeaa259bf99004 ] Jean Delvare reported bonding on top of 3c59x adapters was not detecting network cable removal fast enough. 3c59x indeed uses a 60 seconds timer to check link status if carrier is on, and 5 seconds if carrier is off. This patch reduces timer period to 5 seconds if device is a bonding slave. Reported-by: Jean Delvare <[email protected]> Acked-by: Jean Delvare <[email protected]> Acked-by: Steffen Klassert <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- drivers/net/3c59x.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index 8cc2256..41afc40 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -1842,7 +1842,7 @@ vortex_timer(unsigned long data) ok = 1; } - if (!netif_carrier_ok(dev)) + if (dev->flags & IFF_SLAVE || !netif_carrier_ok(dev)) next_tick = 5*HZ; if (vp->medialock) -- 1.7.7.6 >From f442924080f4730ad3881a07077480e4ea693f66 Mon Sep 17 00:00:00 2001 From: Ben Greear <[email protected]> Date: Tue, 27 Sep 2011 15:16:08 -0400 Subject: [PATCH 06/21] ipv6-multicast: Fix memory leak in input path. [ Upstream commit 2015de5fe2a47086a3260802275932bfd810884e ] Have to free the skb before returning if we fail the fib lookup. Signed-off-by: Ben Greear <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv6/ip6mr.c | 4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 82a8099..450a1ff 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2051,8 +2051,10 @@ int ip6_mr_input(struct sk_buff *skb) int err; err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } read_lock(&mrt_lock); cache = ip6mr_cache_find(mrt, -- 1.7.7.6 >From 96be35d1f8e04252c85db66eb87d48c91ebb1ba3 Mon Sep 17 00:00:00 2001 From: Ben Greear <[email protected]> Date: Fri, 23 Sep 2011 13:11:01 +0000 Subject: [PATCH 07/21] ipv6-multicast: Fix memory leak in IPv6 multicast. [ Upstream commit 67928c4041606f02725f3c95c4c0404e4532df1b ] If reg_vif_xmit cannot find a routing entry, be sure to free the skb before returning the error. Signed-off-by: Ben Greear <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv6/ip6mr.c | 4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 450a1ff..86e3cc1 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -696,8 +696,10 @@ static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, int err; err = ip6mr_fib_lookup(net, &fl6, &mrt); - if (err < 0) + if (err < 0) { + kfree_skb(skb); return err; + } read_lock(&mrt_lock); dev->stats.tx_bytes += skb->len; -- 1.7.7.6 >From bcf91bc1ef139e385d76fabbc9856675b973e3e6 Mon Sep 17 00:00:00 2001 From: Li Wei <[email protected]> Date: Tue, 8 Nov 2011 21:39:28 +0000 Subject: [PATCH 08/21] ipv4: fix for ip_options_rcv_srr() daddr update. [ Upstream commit b12f62efb8ec0b9523bdb6c2d412c07193086de9 ] When opt->srr_is_hit is set skb_rtable(skb) has been updated for 'nexthop' and iph->daddr should always equals to skb_rtable->rt_dst holds, We need update iph->daddr either. Signed-off-by: Li Wei <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/ip_options.c | 1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ec93335..05d20cc 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -640,6 +640,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; + iph->daddr = nexthop; opt->is_changed = 1; } return 0; -- 1.7.7.6 >From ce64e033c5df5fd0828ba839b20b561eb21bdf7d Mon Sep 17 00:00:00 2001 From: Li Wei <[email protected]> Date: Tue, 22 Nov 2011 23:33:10 +0000 Subject: [PATCH 09/21] ipv4: Save nexthop address of LSRR/SSRR option to IPCB. [ Upstream commit ac8a48106be49c422575ddc7531b776f8eb49610 ] We can not update iph->daddr in ip_options_rcv_srr(), It is too early. When some exception ocurred later (eg. in ip_forward() when goto sr_failed) we need the ip header be identical to the original one as ICMP need it. Add a field 'nexthop' in struct ip_options to save nexthop of LSRR or SSRR option. Signed-off-by: Li Wei <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- include/net/inet_sock.h | 2 ++ net/ipv4/ip_forward.c | 2 +- net/ipv4/ip_options.c | 5 +++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index caaff5f..14dd9c7 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -31,6 +31,7 @@ /** struct ip_options - IP Options * * @faddr - Saved first hop address + * @nexthop - Saved nexthop address in LSRR and SSRR * @is_data - Options in __data, rather than skb * @is_strictroute - Strict source route * @srr_is_hit - Packet destination addr was our one @@ -41,6 +42,7 @@ */ struct ip_options { __be32 faddr; + __be32 nexthop; unsigned char optlen; unsigned char srr; unsigned char rr; diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 3b34d1c..29a07b6 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -84,7 +84,7 @@ int ip_forward(struct sk_buff *skb) rt = skb_rtable(skb); - if (opt->is_strictroute && ip_hdr(skb)->daddr != rt->rt_gateway) + if (opt->is_strictroute && opt->nexthop != rt->rt_gateway) goto sr_failed; if (unlikely(skb->len > dst_mtu(&rt->dst) && !skb_is_gso(skb) && diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 05d20cc..1e60f76 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -568,12 +568,13 @@ void ip_forward_options(struct sk_buff *skb) ) { if (srrptr + 3 > srrspace) break; - if (memcmp(&ip_hdr(skb)->daddr, &optptr[srrptr-1], 4) == 0) + if (memcmp(&opt->nexthop, &optptr[srrptr-1], 4) == 0) break; } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; ip_rt_get_source(&optptr[srrptr-1], skb, rt); + ip_hdr(skb)->daddr = opt->nexthop; optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); @@ -640,7 +641,7 @@ int ip_options_rcv_srr(struct sk_buff *skb) } if (srrptr <= srrspace) { opt->srr_is_hit = 1; - iph->daddr = nexthop; + opt->nexthop = nexthop; opt->is_changed = 1; } return 0; -- 1.7.7.6 >From af2f952e45d6ea905810b87deb99775ffb357030 Mon Sep 17 00:00:00 2001 From: Li Wei <[email protected]> Date: Thu, 9 Feb 2012 21:15:25 +0000 Subject: [PATCH 10/21] ipv4: Fix wrong order of ip_rt_get_source() and update iph->daddr. [ Upstream commit 5dc7883f2a7c25f8df40d7479687153558cd531b ] This patch fix a bug which introduced by commit ac8a4810 (ipv4: Save nexthop address of LSRR/SSRR option to IPCB.).In that patch, we saved the nexthop of SRR in ip_option->nexthop and update iph->daddr until we get to ip_forward_options(), but we need to update it before ip_rt_get_source(), otherwise we may get a wrong src. Signed-off-by: Li Wei <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/ip_options.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1e60f76..42dd1a9 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], skb, rt); ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); -- 1.7.7.6 >From ada3a2c294826d5b718b52ecdecfc4149cc67d36 Mon Sep 17 00:00:00 2001 From: Julian Anastasov <[email protected]> Date: Sat, 4 Feb 2012 13:04:46 +0000 Subject: [PATCH 11/21] ipv4: reset flowi parameters on route connect [ Upstream commit e6b45241c57a83197e5de9166b3b0d32ac562609 ] Eric Dumazet found that commit 813b3b5db83 (ipv4: Use caller's on-stack flowi as-is in output route lookups.) that comes in 3.0 added a regression. The problem appears to be that resulting flowi4_oif is used incorrectly as input parameter to some routing lookups. The result is that when connecting to local port without listener if the IP address that is used is not on a loopback interface we incorrectly assign RTN_UNICAST to the output route because no route is matched by oif=lo. The RST packet can not be sent immediately by tcp_v4_send_reset because it expects RTN_LOCAL. So, change ip_route_connect and ip_route_newports to update the flowi4 fields that are input parameters because we do not want unnecessary binding to oif. To make it clear what are the input parameters that can be modified during lookup and to show which fields of floiw4 are reused add a new function to update the flowi4 structure: flowi4_update_output. Thanks to Yurij M. Plotnikov for providing a bug report including a program to reproduce the problem. Thanks to Eric Dumazet for tracking the problem down to tcp_v4_send_reset and providing initial fix. Reported-by: Yurij M. Plotnikov <[email protected]> Signed-off-by: Julian Anastasov <[email protected]> Acked-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- include/net/flow.h | 10 ++++++++++ include/net/route.h | 4 ++++ 2 files changed, 14 insertions(+), 0 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 32359fd..e37cfda 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -90,6 +90,16 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, fl4->fl4_dport = dport; fl4->fl4_sport = sport; } + +/* Reset some input parameters after previous lookup */ +static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, + __be32 daddr, __be32 saddr) +{ + fl4->flowi4_oif = oif; + fl4->flowi4_tos = tos; + fl4->daddr = daddr; + fl4->saddr = saddr; +} struct flowi6 { diff --git a/include/net/route.h b/include/net/route.h index db7b343..5d7aae4 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -270,6 +270,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, if (IS_ERR(rt)) return rt; ip_rt_put(rt); + flowi4_update_output(fl4, oif, tos, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -284,6 +285,9 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); + flowi4_update_output(fl4, sk->sk_bound_dev_if, + RT_CONN_FLAGS(sk), fl4->daddr, + fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); } -- 1.7.7.6 >From 2d877c77ab12236d680b0b11be95af993c5fbd17 Mon Sep 17 00:00:00 2001 From: Thomas Graf <[email protected]> Date: Fri, 10 Feb 2012 04:07:11 +0000 Subject: [PATCH 12/21] net: Don't proxy arp respond if iif == rt->dst.dev if private VLAN is disabled [ Upstream commit 70620c46ac2b45c24b0f22002fdf5ddd1f7daf81 ] Commit 653241 (net: RFC3069, private VLAN proxy arp support) changed the behavior of arp proxy to send arp replies back out on the interface the request came in even if the private VLAN feature is disabled. Previously we checked rt->dst.dev != skb->dev for in scenarios, when proxy arp is enabled on for the netdevice and also when individual proxy neighbour entries have been added. This patch adds the check back for the pneigh_lookup() scenario. Signed-off-by: Thomas Graf <[email protected]> Acked-by: Jesper Dangaard Brouer <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/arp.c | 3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1d5675e..d8f852d 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -906,7 +906,8 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); -- 1.7.7.6 >From 187db713d22e1c648991e335400c4a65acfb3fd9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Tue, 14 Feb 2012 10:11:59 +0000 Subject: [PATCH 13/21] netpoll: netpoll_poll_dev() should access dev->flags [ Upstream commit 58e05f357a039a94aa36475f8c110256f693a239 ] commit 5a698af53f (bond: service netpoll arp queue on master device) tested IFF_SLAVE flag against dev->priv_flags instead of dev->flags Signed-off-by: Eric Dumazet <[email protected]> Cc: WANG Cong <[email protected]> Acked-by: Neil Horman <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/core/netpoll.c | 2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 18d9cbd..05db410 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -193,7 +193,7 @@ void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); - if (dev->priv_flags & IFF_SLAVE) { + if (dev->flags & IFF_SLAVE) { if (dev->npinfo) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; -- 1.7.7.6 >From 3150cc0db3cf3d7a9e65669793fd9e826b29e921 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer <[email protected]> Date: Wed, 4 Jan 2012 17:35:26 +0000 Subject: [PATCH 14/21] net_sched: Bug in netem reordering [ Upstream commit eb10192447370f19a215a8c2749332afa1199d46 ] Not now, but it looks you are correct. q->qdisc is NULL until another additional qdisc is attached (beside tfifo). See 50612537e9ab2969312. The following patch should work. From: Hagen Paul Pfeifer <[email protected]> netem: catch NULL pointer by updating the real qdisc statistic Reported-by: Vijay Subramanian <[email protected]> Signed-off-by: Hagen Paul Pfeifer <[email protected]> Acked-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/sched/sch_netem.c | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 87b9658..2f68459 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -381,8 +381,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->counter = 0; __skb_queue_head(&q->qdisc->q, skb); - q->qdisc->qstats.backlog += qdisc_pkt_len(skb); - q->qdisc->qstats.requeues++; + sch->qstats.backlog += qdisc_pkt_len(skb); + sch->qstats.requeues++; ret = NET_XMIT_SUCCESS; } -- 1.7.7.6 >From ae29e0eebc59e4eb5c47a5f318e9cd75abc463c7 Mon Sep 17 00:00:00 2001 From: David Lv <[email protected]> Date: Sat, 4 Feb 2012 23:22:26 +0000 Subject: [PATCH 15/21] via-velocity: S3 resume fix. [ Upstream commit b530b1930bbd9d005345133f0ff0c556d2a52b19 ] Initially diagnosed on Ubuntu 11.04 with kernel 2.6.38. velocity_close is not called during a suspend / resume cycle in this driver and it has no business playing directly with power states. Signed-off-by: David Lv <[email protected]> Acked-by: Francois Romieu <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- drivers/net/via-velocity.c | 3 --- 1 files changed, 0 insertions(+), 3 deletions(-) diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c index 06daa9d..c7e4934 100644 --- a/drivers/net/via-velocity.c +++ b/drivers/net/via-velocity.c @@ -2513,9 +2513,6 @@ static int velocity_close(struct net_device *dev) if (dev->irq != 0) free_irq(dev->irq, dev); - /* Power down the chip */ - pci_set_power_state(vptr->pdev, PCI_D3hot); - velocity_free_rings(vptr); vptr->flags &= (~VELOCITY_FLAGS_OPENED); -- 1.7.7.6 >From f96e07ebea3efd0dc063d7e4465905fd0e0a09f4 Mon Sep 17 00:00:00 2001 From: Shawn Lu <[email protected]> Date: Sat, 4 Feb 2012 12:38:09 +0000 Subject: [PATCH 16/21] tcp_v4_send_reset: binding oif to iif in no sock case [ Upstream commit e2446eaab5585555a38ea0df4e01ff313dbb4ac9 ] Binding RST packet outgoing interface to incoming interface for tcp v4 when there is no socket associate with it. when sk is not NULL, using sk->sk_bound_dev_if instead. (suggested by Eric Dumazet). This has few benefits: 1. tcp_v6_send_reset already did that. 2. This helps tcp connect with SO_BINDTODEVICE set. When connection is lost, we still able to sending out RST using same interface. 3. we are sending reply, it is most likely to be succeed if iif is used Signed-off-by: Shawn Lu <[email protected]> Acked-by: Eric Dumazet <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/tcp_ipv4.c | 5 +++++ 1 files changed, 5 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 53b0125..04c6592 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -650,6 +650,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. + * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); net = dev_net(skb_dst(skb)->dev); ip_send_reply(net->ipv4.tcp_sock, skb, ip_hdr(skb)->saddr, -- 1.7.7.6 >From fc85ddd48982affec7591f56861d286ef6ff259f Mon Sep 17 00:00:00 2001 From: Neal Cardwell <[email protected]> Date: Sun, 12 Feb 2012 18:37:09 +0000 Subject: [PATCH 17/21] tcp: allow tcp_sacktag_one() to tag ranges not aligned with skbs [ Upstream commit cc9a672ee522d4805495b98680f4a3db5d0a0af9 ] This commit allows callers of tcp_sacktag_one() to pass in sequence ranges that do not align with skb boundaries, as tcp_shifted_skb() needs to do in an upcoming fix in this patch series. In fact, now tcp_sacktag_one() does not need to depend on an input skb at all, which makes its semantics and dependencies more clear. Signed-off-by: Neal Cardwell <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++-------------- 1 files changed, 22 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c68040f..af41044 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1289,25 +1289,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); - u8 sacked = TCP_SKB_CB(skb)->sacked; int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1326,13 +1327,13 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ - if (before(TCP_SKB_CB(skb)->seq, + if (before(start_seq, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + if (!after(end_seq, tp->frto_highmark)) state->flag |= FLAG_ONLY_ORIG_SACKED; } @@ -1350,8 +1351,7 @@ static u8 tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq)) + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) @@ -1407,7 +1407,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, } /* We discard results */ - tcp_sacktag_one(skb, sk, state, dup_sack, pcount); + tcp_sacktag_one(sk, state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1646,10 +1650,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, - state, - dup_sack, - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) -- 1.7.7.6 >From 8e0325839acf261bb1a0e515d4145a9139a12afa Mon Sep 17 00:00:00 2001 From: Neal Cardwell <[email protected]> Date: Sun, 12 Feb 2012 18:37:10 +0000 Subject: [PATCH 18/21] tcp: fix range tcp_shifted_skb() passes to tcp_sacktag_one() [ Upstream commit daef52bab1fd26e24e8e9578f8fb33ba1d0cb412 ] Fix the newly-SACKed range to be the range of newly-shifted bytes. Previously - since 832d11c5cd076abc0aa1eaf7be96c81d1a59ce41 - tcp_shifted_skb() incorrectly called tcp_sacktag_one() with the start and end sequence numbers of the skb it passes in set to the range just beyond the range that is newly-SACKed. This commit also removes a special-case adjustment to lost_cnt_hint in tcp_shifted_skb() since the pre-existing adjustment of lost_cnt_hint in tcp_sacktag_one() now properly handles this things now that the correct start sequence number is passed in. Signed-off-by: Neal Cardwell <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/tcp_input.c | 19 ++++++++++--------- 1 files changed, 10 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index af41044..78689e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1370,6 +1370,9 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, @@ -1377,12 +1380,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); - if (skb == tp->lost_skb_hint) - tp->lost_cnt_hint += pcount; - TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; @@ -1406,12 +1408,11 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* We discard results */ - tcp_sacktag_one(sk, state, - TCP_SKB_CB(skb)->sacked, - TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(skb)->end_seq, - dup_sack, pcount); + /* Adjust counters and hints for the newly sacked sequence range but + * discard the return value since prev is already marked. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); -- 1.7.7.6 >From 0d54dedba76b4142642d9f95b5548d75af5b3abb Mon Sep 17 00:00:00 2001 From: Neal Cardwell <[email protected]> Date: Mon, 13 Feb 2012 20:22:08 +0000 Subject: [PATCH 19/21] tcp: fix tcp_shifted_skb() adjustment of lost_cnt_hint for FACK [ Upstream commit 0af2a0d0576205dda778d25c6c344fc6508fc81d ] This commit ensures that lost_cnt_hint is correctly updated in tcp_shifted_skb() for FACK TCP senders. The lost_cnt_hint adjustment in tcp_sacktag_one() only applies to non-FACK senders, so FACK senders need their own adjustment. This applies the spirit of 1e5289e121372a3494402b1b131b41bfe1cf9b7f - except now that the sequence range passed into tcp_sacktag_one() is correct we need only have a special case adjustment for FACK. Signed-off-by: Neal Cardwell <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/tcp_input.c | 4 ++++ 1 files changed, 4 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 78689e5..ee08f11f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1385,6 +1385,10 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!pcount); + /* Adjust hint for FACK. Non-FACK is handled in tcp_sacktag_one(). */ + if (tcp_is_fack(tp) && (skb == tp->lost_skb_hint)) + tp->lost_cnt_hint += pcount; + TCP_SKB_CB(prev)->end_seq += shifted; TCP_SKB_CB(skb)->seq += shifted; -- 1.7.7.6 >From f07f0377c5f8e71dff99288cb9f5cb8029a138ca Mon Sep 17 00:00:00 2001 From: Flavio Leitner <[email protected]> Date: Mon, 24 Oct 2011 02:56:38 -0400 Subject: [PATCH 20/21] route: fix ICMP redirect validation [ Upstream commit 7cc9150ebe8ec06cafea9f1c10d92ddacf88d8ae ] The commit f39925dbde7788cfb96419c0f092b086aa325c0f (ipv4: Cache learned redirect information in inetpeer.) removed some ICMP packet validations which are required by RFC 1122, section 3.2.2.2: ... A Redirect message SHOULD be silently discarded if the new gateway address it specifies is not on the same connected (sub-) net through which the Redirect arrived [INTRO:2, Appendix A], or if the source of the Redirect is not the current first-hop gateway for the specified destination (see Section 3.3.1). Signed-off-by: Flavio Leitner <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/route.c | 36 +++++++++++++++++++++++++++++++----- 1 files changed, 31 insertions(+), 5 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 65ff2e5..f881be2 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1373,7 +1373,12 @@ static void rt_del(unsigned hash, struct rtable *rt) void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { + int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); + struct rtable *rt; + __be32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + struct flowi4 fl4; struct inet_peer *peer; struct net *net; @@ -1396,13 +1401,34 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - peer = inet_getpeer_v4(daddr, 1); - if (peer) { - peer->redirect_learned.a4 = new_gw; + memset(&fl4, 0, sizeof(fl4)); + fl4.daddr = daddr; + for (s = 0; s < 2; s++) { + for (i = 0; i < 2; i++) { + fl4.flowi4_oif = ikeys[i]; + fl4.saddr = skeys[s]; + rt = __ip_route_output_key(net, &fl4); + if (IS_ERR(rt)) + continue; - inet_putpeer(peer); + if (rt->dst.error || rt->dst.dev != dev || + rt->rt_gateway != old_gw) { + ip_rt_put(rt); + continue; + } - atomic_inc(&__rt_peer_genid); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); + + peer = rt->peer; + if (peer) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + + ip_rt_put(rt); + return; + } } return; -- 1.7.7.6 >From 09a3e9ea78f91aef27c2dfaca246b0c7d1a5a3d8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <[email protected]> Date: Fri, 18 Nov 2011 15:24:32 -0500 Subject: [PATCH 21/21] ipv4: fix redirect handling [ Upstream commit 9cc20b268a5a14f5e57b8ad405a83513ab0d78dc ] commit f39925dbde77 (ipv4: Cache learned redirect information in inetpeer.) introduced a regression in ICMP redirect handling. It assumed ipv4_dst_check() would be called because all possible routes were attached to the inetpeer we modify in ip_rt_redirect(), but thats not true. commit 7cc9150ebe (route: fix ICMP redirect validation) tried to fix this but solution was not complete. (It fixed only one route) So we must lookup existing routes (including different TOS values) and call check_peer_redir() on them. Reported-by: Ivan Zahariev <[email protected]> Signed-off-by: Eric Dumazet <[email protected]> CC: Flavio Leitner <[email protected]> Signed-off-by: David S. Miller <[email protected]> --- net/ipv4/route.c | 109 ++++++++++++++++++++++++++++------------------------- 1 files changed, 58 insertions(+), 51 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index f881be2..6b95f74 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1369,16 +1369,41 @@ static void rt_del(unsigned hash, struct rtable *rt) spin_unlock_bh(rt_hash_lock_addr(hash)); } +static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) +{ + struct rtable *rt = (struct rtable *) dst; + __be32 orig_gw = rt->rt_gateway; + struct neighbour *n, *old_n; + + dst_confirm(&rt->dst); + + rt->rt_gateway = peer->redirect_learned.a4; + n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); + if (IS_ERR(n)) + return PTR_ERR(n); + old_n = xchg(&rt->dst._neighbour, n); + if (old_n) + neigh_release(old_n); + if (!n || !(n->nud_state & NUD_VALID)) { + if (n) + neigh_event_send(n, NULL); + rt->rt_gateway = orig_gw; + return -EAGAIN; + } else { + rt->rt_flags |= RTCF_REDIRECTED; + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); + } + return 0; +} + /* called in rcu_read_lock() section */ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, __be32 saddr, struct net_device *dev) { int s, i; struct in_device *in_dev = __in_dev_get_rcu(dev); - struct rtable *rt; __be32 skeys[2] = { saddr, 0 }; int ikeys[2] = { dev->ifindex, 0 }; - struct flowi4 fl4; struct inet_peer *peer; struct net *net; @@ -1401,33 +1426,42 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, goto reject_redirect; } - memset(&fl4, 0, sizeof(fl4)); - fl4.daddr = daddr; for (s = 0; s < 2; s++) { for (i = 0; i < 2; i++) { - fl4.flowi4_oif = ikeys[i]; - fl4.saddr = skeys[s]; - rt = __ip_route_output_key(net, &fl4); - if (IS_ERR(rt)) - continue; - - if (rt->dst.error || rt->dst.dev != dev || - rt->rt_gateway != old_gw) { - ip_rt_put(rt); - continue; - } + unsigned int hash; + struct rtable __rcu **rthp; + struct rtable *rt; + + hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net)); + + rthp = &rt_hash_table[hash].chain; + + while ((rt = rcu_dereference(*rthp)) != NULL) { + rthp = &rt->dst.rt_next; + + if (rt->rt_key_dst != daddr || + rt->rt_key_src != skeys[s] || + rt->rt_oif != ikeys[i] || + rt_is_input_route(rt) || + rt_is_expired(rt) || + !net_eq(dev_net(rt->dst.dev), net) || + rt->dst.error || + rt->dst.dev != dev || + rt->rt_gateway != old_gw) + continue; - if (!rt->peer) - rt_bind_peer(rt, rt->rt_dst, 1); + if (!rt->peer) + rt_bind_peer(rt, rt->rt_dst, 1); - peer = rt->peer; - if (peer) { - peer->redirect_learned.a4 = new_gw; - atomic_inc(&__rt_peer_genid); + peer = rt->peer; + if (peer) { + if (peer->redirect_learned.a4 != new_gw) { + peer->redirect_learned.a4 = new_gw; + atomic_inc(&__rt_peer_genid); + } + check_peer_redir(&rt->dst, peer); + } } - - ip_rt_put(rt); - return; } } return; @@ -1715,33 +1749,6 @@ static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) } } -static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer) -{ - struct rtable *rt = (struct rtable *) dst; - __be32 orig_gw = rt->rt_gateway; - struct neighbour *n, *old_n; - - dst_confirm(&rt->dst); - - rt->rt_gateway = peer->redirect_learned.a4; - n = __arp_bind_neighbour(&rt->dst, rt->rt_gateway); - if (IS_ERR(n)) - return PTR_ERR(n); - old_n = xchg(&rt->dst._neighbour, n); - if (old_n) - neigh_release(old_n); - if (!n || !(n->nud_state & NUD_VALID)) { - if (n) - neigh_event_send(n, NULL); - rt->rt_gateway = orig_gw; - return -EAGAIN; - } else { - rt->rt_flags |= RTCF_REDIRECTED; - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n); - } - return 0; -} - static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) { struct rtable *rt = (struct rtable *) dst; -- 1.7.7.6
