In EDT design, I made the mistake of using tcp_wstamp_ns
to store the last tcp_clock_ns() sample and to store the
pacing virtual timer.

This causes major regressions at high speed flows.

Introduce tcp_clock_cache to store last tcp_clock_ns().
This is needed because some arches have slow high-resolution
kernel time service.

tcp_wstamp_ns is only updated when a packet is sent.

Note that we can remove tcp_mstamp in the future since
tcp_mstamp is essentially tcp_clock_cache/1000, so the
apparent socket size increase is temporary.

Fixes: 9799ccb0e984 ("tcp: add tcp_wstamp_ns socket field")
Signed-off-by: Eric Dumazet <eduma...@google.com>
Acked-by: Soheil Hassas Yeganeh <soh...@google.com>
---
 include/linux/tcp.h   | 1 +
 net/ipv4/tcp_output.c | 9 ++++++---
 net/ipv4/tcp_timer.c  | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 
848f5b25e178288ce870637b68a692ab88dc7d4d..8ed77bb4ed8636e9294389a011529fd9a667dce4
 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -249,6 +249,7 @@ struct tcp_sock {
        u32     tlp_high_seq;   /* snd_nxt at the time of TLP retransmit. */
 
        u64     tcp_wstamp_ns;  /* departure time for next sent data packet */
+       u64     tcp_clock_cache; /* cache last tcp_clock_ns() (see 
tcp_mstamp_refresh()) */
 
 /* RTT measurement */
        u64     tcp_mstamp;     /* most recent packet received/sent */
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 
059b67af28b137fb9566eaef370b270fc424bffb..f14df66a0c858dcb22b8924b9691c375eb5fcbc5
 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -52,9 +52,8 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
 {
        u64 val = tcp_clock_ns();
 
-       /* departure time for next data packet */
-       if (val > tp->tcp_wstamp_ns)
-               tp->tcp_wstamp_ns = val;
+       if (val > tp->tcp_clock_cache)
+               tp->tcp_clock_cache = val;
 
        val = div_u64(val, NSEC_PER_USEC);
        if (val > tp->tcp_mstamp)
@@ -1050,6 +1049,10 @@ static int __tcp_transmit_skb(struct sock *sk, struct 
sk_buff *skb,
                if (unlikely(!skb))
                        return -ENOBUFS;
        }
+
+       /* TODO: might take care of jitter here */
+       tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
+
        skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
 
        inet = inet_sk(sk);
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 
61023d50cd604d5e19464a32c33b65d29c75c81e..676020663ce80a79341ad1a05352742cc8dd5850
 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -360,7 +360,7 @@ static void tcp_probe_timer(struct sock *sk)
         */
        start_ts = tcp_skb_timestamp(skb);
        if (!start_ts)
-               skb->skb_mstamp_ns = tp->tcp_wstamp_ns;
+               skb->skb_mstamp_ns = tp->tcp_clock_cache;
        else if (icsk->icsk_user_timeout &&
                 (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout)
                goto abort;
-- 
2.19.0.605.g01d371f741-goog

Reply via email to