Currently sk_rethink_txhash() re-rolls the socket's txhash on RTO, PLB,
and spurious-retransmission events, but the cached route is reused and
the new hash is not propagated into the ECMP path selection logic.  Two
changes are needed to make rehash select a different local ECMP path:

1. Add __sk_dst_reset() alongside sk_rethink_txhash() in
   tcp_write_timeout(), tcp_rcv_spurious_retrans(), and
   tcp_plb_check_rehash() so the cached dst is invalidated and the
   next transmit triggers a fresh route lookup.

2. Set fl6->mp_hash from sk_txhash (or tcp_rsk(req)->txhash for
   SYN/ACK retransmits) in inet6_sk_rebuild_header(),
   inet6_csk_route_req(), and inet6_csk_route_socket() so
   fib6_select_path() picks a path based on the new hash.

   It is necessary to set mp_hash explicitly because the default ECMP
   hash derives from fl6->flowlabel, which is copied from
   np->flow_label; np->flow_label is not updated from sk_txhash
   (REPFLOW is off by default).  ip6_make_flowlabel() cannot help
   either, as it runs after the route lookup.

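In tcp_write_timeout() and tcp_plb_check_rehash() the dst reset is
guarded by sk->sk_family == AF_INET6 since IPv4 ECMP does not currently
use sk_txhash for path selection.  tcp_rcv_spurious_retrans() needs no
extra check: the rehash there is already conditional on
skb->protocol == htons(ETH_P_IPV6).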

tcp_rsk(req)->txhash initialization is moved before route_req() in
tcp_conn_request() so that inet6_csk_route_req() reads a valid hash
on the initial SYN/ACK.

Signed-off-by: Neil Spring <[email protected]>
---
 net/ipv4/tcp_input.c             | 6 ++++--
 net/ipv4/tcp_plb.c               | 7 ++++++-
 net/ipv4/tcp_timer.c             | 4 ++++
 net/ipv6/af_inet6.c              | 3 +++
 net/ipv6/inet6_connection_sock.c | 6 ++++++
 5 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7995a89bafc9..8f602a665b71 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5020,8 +5020,10 @@ static void tcp_rcv_spurious_retrans(struct sock *sk,
            skb->protocol == htons(ETH_P_IPV6) &&
            (tcp_sk(sk)->inet_conn.icsk_ack.lrcv_flowlabel !=
             ntohl(ip6_flowlabel(ipv6_hdr(skb)))) &&
-           sk_rethink_txhash(sk))
+           sk_rethink_txhash(sk)) {
                NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH);
+               __sk_dst_reset(sk);
+       }
 
        /* Save last flowlabel after a spurious retrans. */
        tcp_save_lrcv_flowlabel(sk, skb);
@@ -7636,6 +7638,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        tcp_rsk(req)->af_specific = af_ops;
        tcp_rsk(req)->ts_off = 0;
        tcp_rsk(req)->req_usec_ts = false;
+       tcp_rsk(req)->txhash = net_tx_rndhash();
 #if IS_ENABLED(CONFIG_MPTCP)
        tcp_rsk(req)->is_mptcp = 0;
 #endif
@@ -7717,7 +7720,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
        }
 #endif
        tcp_rsk(req)->snt_isn = isn;
-       tcp_rsk(req)->txhash = net_tx_rndhash();
        tcp_rsk(req)->syn_tos = TCP_SKB_CB(skb)->ip_dsfield;
        tcp_openreq_init_rwin(req, sk, dst);
        sk_rx_queue_set(req_to_sk(req), skb);
diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c
index c11a0cd3f8fe..accdd83dfc3d 100644
--- a/net/ipv4/tcp_plb.c
+++ b/net/ipv4/tcp_plb.c
@@ -78,7 +78,12 @@ void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb)
        if (plb->pause_until)
                return;
 
-       sk_rethink_txhash(sk);
+       if (sk_rethink_txhash(sk)) {
+#if IS_ENABLED(CONFIG_IPV6)
+               if (sk->sk_family == AF_INET6)
+                       __sk_dst_reset(sk);
+#endif
+       }
        plb->consec_cong_rounds = 0;
        WRITE_ONCE(tcp_sk(sk)->plb_rehash, tcp_sk(sk)->plb_rehash + 1);
        NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 322db13333c7..24c1c19eda6e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -300,6 +300,10 @@ static int tcp_write_timeout(struct sock *sk)
        if (sk_rethink_txhash(sk)) {
                WRITE_ONCE(tp->timeout_rehash, tp->timeout_rehash + 1);
                __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH);
+#if IS_ENABLED(CONFIG_IPV6)
+               if (sk->sk_family == AF_INET6)
+                       __sk_dst_reset(sk);
+#endif
        }
 
        return 0;
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 0a88b376141d..90ff4448aa56 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -823,6 +823,9 @@ int inet6_sk_rebuild_header(struct sock *sk)
        fl6->flowi6_uid = sk_uid(sk);
        security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
 
+       /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */
+       fl6->mp_hash = sk->sk_txhash >> 1;
+
        rcu_read_lock();
        final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final);
        rcu_read_unlock();
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 37534e116899..fc4b75de6af8 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -48,6 +48,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
        fl6->flowi6_uid = sk_uid(sk);
        security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
 
+       /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */
+       fl6->mp_hash = tcp_rsk(req)->txhash >> 1;
+
        if (!dst) {
                dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
                if (IS_ERR(dst))
@@ -70,6 +73,9 @@ struct dst_entry *inet6_csk_route_socket(struct sock *sk,
        fl6->saddr = np->saddr;
        fl6->flowlabel = np->flow_label;
        IP6_ECN_flow_xmit(sk, fl6->flowlabel);
+
+       /* >> 1 for 31-bit mp_hash range matching nhc_upper_bound. */
+       fl6->mp_hash = sk->sk_txhash >> 1;
        fl6->flowi6_oif = sk->sk_bound_dev_if;
        fl6->flowi6_mark = sk->sk_mark;
        fl6->fl6_sport = inet->inet_sport;
-- 
2.53.0-Meta


Reply via email to