On 7/12/18, 9:05 AM, "netdev-ow...@vger.kernel.org on behalf of Yuchung Cheng" 
<netdev-ow...@vger.kernel.org on behalf of ych...@google.com> wrote:

    Previously, when a data segment was sent an ACK was piggybacked
    on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
    event to notify congestion control modules. So the DCTCP
    ca->delayed_ack_reserved flag could incorrectly stay set when
    in fact there were no delayed ACKs being reserved. This could result
    in sending a special ECN notification ACK that carries an older
    ACK sequence, when in fact there was no need for such an ACK.
    DCTCP keeps track of the delayed ACK status with its own separate
    state ca->delayed_ack_reserved. Previously it may accidentally cancel
    the delayed ACK without updating this field upon sending a special
    ACK that carries a older ACK sequence. This inconsistency would
    lead to DCTCP receiver never acknowledging the latest data until the
    sender times out and retry in some cases.
    
    Packetdrill script (provided by Larry Brakmo)
    
    0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
    0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
    0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
    0.000 bind(3, ..., ...) = 0
    0.000 listen(3, 1) = 0
    
    0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
    0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
    0.110 < [ect0] . 1:1(0) ack 1 win 257
    0.200 accept(3, ..., ...) = 4
    
    0.200 < [ect0] . 1:1001(1000) ack 1 win 257
    0.200 > [ect01] . 1:1(0) ack 1001
    
    0.200 write(4, ..., 1) = 1
    0.200 > [ect01] P. 1:2(1) ack 1001
    
    0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
    0.200 write(4, ..., 1) = 1
    0.200 > [ect01] P. 2:3(1) ack 2001
    
    0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
    0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
    0.200 > [ect01] . 3:3(0) ack 4001
    
    0.210 < [ce] P. 4001:4501(500) ack 3 win 257
    
    +0.001 read(4, ..., 4500) = 4500
    +0 write(4, ..., 1) = 1
    +0 > [ect01] PE. 3:4(1) ack 4501
    
    +0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
    // Previously the ACK sequence below would be 4501, causing a long RTO
    +0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack
    
    +0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
    +0 > [ect01] . 4:4(0) ack 6501     // now acks everything
    
    +0.500 < F. 9501:9501(0) ack 4 win 257
    
    Reported-by: Larry Brakmo <bra...@fb.com>
    Signed-off-by: Yuchung Cheng <ych...@google.com>
    Signed-off-by: Eric Dumazet <eduma...@google.com>
    Acked-by: Neal Cardwell <ncardw...@google.com>
    ---
     net/ipv4/tcp_dctcp.c | 6 ++++--
     1 file changed, 4 insertions(+), 2 deletions(-)
    
    diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c
    index 5f5e5936760e..89f88b0d8167 100644
    --- a/net/ipv4/tcp_dctcp.c
    +++ b/net/ipv4/tcp_dctcp.c
    @@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct sock *sk)
        /* State has changed from CE=0 to CE=1 and delayed
         * ACK has not sent yet.
         */
    -   if (!ca->ce_state && ca->delayed_ack_reserved) {
    +   if (!ca->ce_state &&
    +       inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
                u32 tmp_rcv_nxt;
     
                /* Save current rcv_nxt. */
    @@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct sock *sk)
        /* State has changed from CE=1 to CE=0 and delayed
         * ACK has not sent yet.
         */
    -   if (ca->ce_state && ca->delayed_ack_reserved) {
    +   if (ca->ce_state &&
    +       inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
                u32 tmp_rcv_nxt;
     
                /* Save current rcv_nxt. */
    -- 
    2.18.0.203.gfac676dfb9-goog
    
LGTM. Thanks for the patch.

Acked-by: Lawrence Brakmo <bra...@fb.com>
    

Reply via email to