Some operators prefer IPv6 path selection to use a standard 5-tuple
hash rather than just an L3 hash with the flow the label. To that end
add support to IPv6 for multipath hash policy similar to bf4e0a3db97eb
("net: ipv4: add support for ECMP hash policy choice"). The default
is still L3 which covers source and destination addresses along with
flow label and IPv6 protocol.

Signed-off-by: David Ahern <dsah...@gmail.com>
---
 Documentation/networking/ip-sysctl.txt |  7 ++++
 include/net/ip6_route.h                |  3 +-
 include/net/netevent.h                 |  1 +
 include/net/netns/ipv6.h               |  1 +
 net/ipv6/icmp.c                        |  2 +-
 net/ipv6/route.c                       | 64 ++++++++++++++++++++++++++--------
 net/ipv6/sysctl_net_ipv6.c             | 26 ++++++++++++++
 7 files changed, 87 insertions(+), 17 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index a553d4e4a0fb..783675a730e5 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -1363,6 +1363,13 @@ flowlabel_reflect - BOOLEAN
        FALSE: disabled
        Default: FALSE
 
+fib_multipath_hash_policy - INTEGER
+       Controls which hash policy to use for multipath routes.
+       Default: 0 (Layer 3)
+       Possible values:
+       0 - Layer 3 (source and destination addresses plus flow label)
+       1 - Layer 4 (standard 5-tuple)
+
 anycast_src_echo_reply - BOOLEAN
        Controls the use of anycast addresses as source addresses for ICMPv6
        echo reply
diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h
index 27d23a65f3cd..f13657a62e5c 100644
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -127,7 +127,8 @@ static inline int ip6_route_get_saddr(struct net *net, 
struct rt6_info *rt,
 
 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
                            const struct in6_addr *saddr, int oif, int flags);
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb);
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+                      const struct sk_buff *skb);
 
 struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6);
 
diff --git a/include/net/netevent.h b/include/net/netevent.h
index baee605a94ab..d9918261701c 100644
--- a/include/net/netevent.h
+++ b/include/net/netevent.h
@@ -27,6 +27,7 @@ enum netevent_notif_type {
        NETEVENT_REDIRECT,         /* arg is struct netevent_redirect ptr */
        NETEVENT_DELAY_PROBE_TIME_UPDATE, /* arg is struct neigh_parms ptr */
        NETEVENT_IPV4_MPATH_HASH_UPDATE, /* arg is struct net ptr */
+       NETEVENT_IPV6_MPATH_HASH_UPDATE, /* arg is struct net ptr */
 };
 
 int register_netevent_notifier(struct notifier_block *nb);
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 987cc4569cb8..6b0de3b71bbf 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -28,6 +28,7 @@ struct netns_sysctl_ipv6 {
        int ip6_rt_gc_elasticity;
        int ip6_rt_mtu_expires;
        int ip6_rt_min_advmss;
+       int multipath_hash_policy;
        int flowlabel_consistency;
        int auto_flowlabels;
        int icmpv6_time;
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 4fa4f1b150a4..f53c14390d9f 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -522,7 +522,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 
code, __u32 info,
        fl6.fl6_icmp_type = type;
        fl6.fl6_icmp_code = code;
        fl6.flowi6_uid = sock_net_uid(net, NULL);
-       fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
+       fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb);
        security_skb_classify_flow(skb, flowi6_to_flowi(&fl6));
 
        sk = icmpv6_xmit_lock(net);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ab0de69ec67d..5c4481e4f3e1 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -450,7 +450,8 @@ static bool rt6_check_expired(const struct rt6_info *rt)
        return false;
 }
 
-static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
+static struct rt6_info *rt6_multipath_select(const struct net *net,
+                                            struct rt6_info *match,
                                             struct flowi6 *fl6, int oif,
                                             int strict)
 {
@@ -460,7 +461,7 @@ static struct rt6_info *rt6_multipath_select(struct 
rt6_info *match,
         * case it will always be non-zero. Otherwise now is the time to do it.
         */
        if (!fl6->mp_hash)
-               fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
+               fl6->mp_hash = rt6_multipath_hash(net, fl6, NULL);
 
        if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
                return match;
@@ -929,7 +930,7 @@ static struct rt6_info *ip6_pol_route_lookup(struct net 
*net,
                rt = rt6_device_match(net, rt, &fl6->saddr,
                                      fl6->flowi6_oif, flags);
                if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
-                       rt = rt6_multipath_select(rt, fl6,
+                       rt = rt6_multipath_select(net, rt, fl6,
                                                  fl6->flowi6_oif, flags);
        }
        if (rt == net->ipv6.ip6_null_entry) {
@@ -1669,7 +1670,7 @@ struct rt6_info *ip6_pol_route(struct net *net, struct 
fib6_table *table,
 redo_rt6_select:
        rt = rt6_select(net, fn, oif, strict);
        if (rt->rt6i_nsiblings)
-               rt = rt6_multipath_select(rt, fl6, oif, strict);
+               rt = rt6_multipath_select(net, rt, fl6, oif, strict);
        if (rt == net->ipv6.ip6_null_entry) {
                fn = fib6_backtrack(fn, &fl6->saddr);
                if (fn)
@@ -1819,20 +1820,53 @@ static void ip6_multipath_l3_keys(const struct sk_buff 
*skb,
 }
 
 /* if skb is set it will be used and fl6 can be NULL */
-u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
+u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
+                      const struct sk_buff *skb)
 {
        struct flow_keys hash_keys;
        u32 mhash;
 
-       memset(&hash_keys, 0, sizeof(hash_keys));
-       hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
-       if (skb) {
-               ip6_multipath_l3_keys(skb, &hash_keys);
-       } else {
-               hash_keys.addrs.v6addrs.src = fl6->saddr;
-               hash_keys.addrs.v6addrs.dst = fl6->daddr;
-               hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
-               hash_keys.basic.ip_proto = fl6->flowi6_proto;
+       switch (net->ipv6.sysctl.multipath_hash_policy) {
+       case 0:
+               memset(&hash_keys, 0, sizeof(hash_keys));
+               hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+               if (skb) {
+                       ip6_multipath_l3_keys(skb, &hash_keys);
+               } else {
+                       hash_keys.addrs.v6addrs.src = fl6->saddr;
+                       hash_keys.addrs.v6addrs.dst = fl6->daddr;
+                       hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
+                       hash_keys.basic.ip_proto = fl6->flowi6_proto;
+               }
+               break;
+       case 1:
+               if (skb) {
+                       unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
+                       struct flow_keys keys;
+
+                       /* short-circuit if we already have L4 hash present */
+                       if (skb->l4_hash)
+                               return skb_get_hash_raw(skb) >> 1;
+
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+
+                       skb_flow_dissect_flow_keys(skb, &keys, flag);
+
+                       hash_keys.control.addr_type = 
FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+                       hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
+                       hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
+                       hash_keys.ports.src = keys.ports.src;
+                       hash_keys.ports.dst = keys.ports.dst;
+                       hash_keys.basic.ip_proto = keys.basic.ip_proto;
+               } else {
+                       memset(&hash_keys, 0, sizeof(hash_keys));
+                       hash_keys.control.addr_type = 
FLOW_DISSECTOR_KEY_IPV6_ADDRS;
+                       hash_keys.addrs.v6addrs.src = fl6->saddr;
+                       hash_keys.addrs.v6addrs.dst = fl6->daddr;
+                       hash_keys.ports.src = fl6->fl6_sport;
+                       hash_keys.ports.dst = fl6->fl6_dport;
+                       hash_keys.basic.ip_proto = fl6->flowi6_proto;
+               }
        }
        mhash = flow_hash_from_keys(&hash_keys);
 
@@ -1858,7 +1892,7 @@ void ip6_route_input(struct sk_buff *skb)
        if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
                fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
        if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
-               fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
+               fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb);
        skb_dst_drop(skb);
        skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c
index 262f791f1b9b..6125accc993a 100644
--- a/net/ipv6/sysctl_net_ipv6.c
+++ b/net/ipv6/sysctl_net_ipv6.c
@@ -16,14 +16,31 @@
 #include <net/ipv6.h>
 #include <net/addrconf.h>
 #include <net/inet_frag.h>
+#include <net/netevent.h>
 #ifdef CONFIG_NETLABEL
 #include <net/calipso.h>
 #endif
 
+static int zero;
 static int one = 1;
 static int auto_flowlabels_min;
 static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX;
 
+static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write,
+                                         void __user *buffer, size_t *lenp,
+                                         loff_t *ppos)
+{
+       struct net *net;
+       int ret;
+
+       net = container_of(table->data, struct net,
+                          ipv6.sysctl.multipath_hash_policy);
+       ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+       if (write && ret == 0)
+               call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net);
+
+       return ret;
+}
 
 static struct ctl_table ipv6_table_template[] = {
        {
@@ -126,6 +143,15 @@ static struct ctl_table ipv6_table_template[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec
        },
+       {
+               .procname       = "fib_multipath_hash_policy",
+               .data           = &init_net.ipv6.sysctl.multipath_hash_policy,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_rt6_multipath_hash_policy,
+               .extra1         = &zero,
+               .extra2         = &one,
+       },
        { }
 };
 
-- 
2.11.0

Reply via email to