Hi Yu He,

This code is simply using integer arithmetics (float is not really possible in 
the kernel), left-shifting the fractional value of g by 1024 (10 bits).

Max_alpha_value = 1024 is “1” shifted left by 10.

Agreed that this is not clearly documented, and I believe the sysctl handler 
also is not properly implemented to adjust this value.

I thought I had been working on this…

Ah, here is it. I was trying to implement the Fluid-model DCTCP for much better 
RTT fairness, but apparently got distracted before putting on a Diff.
Fluid-model DCTCP was analyzed by the original authors of DCTCP, and basically 
adjusts cwnd fractionally immediately after a CE is received, instead of once 
at the end of a window. The update to Alpha is kept to once per window, to keep 
the bookkeeping easy and straight forward.

For reference, here is the partial code I came up with.

I’ll break this into an initial Diff to fix the sysctl tunables, as soon as I 
can. Would appreciate any help in getting the fluid-model improvement fully 
tested though.


[root@freebsd ~/netinet/cc]# git diff master cc_dctcp.c
diff --git a/sys/netinet/cc/cc_dctcp.c b/sys/netinet/cc/cc_dctcp.c
index 9affd0da2b3..778ff7a8477 100644
--- a/sys/netinet/cc/cc_dctcp.c
+++ b/sys/netinet/cc/cc_dctcp.c
@@ -56,7 +56,38 @@ __FBSDID("$FreeBSD$");
#include <netinet/cc/cc.h>
#include <netinet/cc/cc_module.h>

-#define MAX_ALPHA_VALUE 1024
+#define DCTCP_SHIFT 10
+#define MAX_ALPHA_VALUE 1<<DCTCP_SHIFT
VNET_DEFINE_STATIC(uint32_t, dctcp_alpha) = 0;
#define V_dctcp_alpha      VNET(dctcp_alpha)
VNET_DEFINE_STATIC(uint32_t, dctcp_shift_g) = 4;
@@ -65,14 +96,15 @@ VNET_DEFINE_STATIC(uint32_t, dctcp_slowstart) = 0;
#define        V_dctcp_slowstart   VNET(dctcp_slowstart)

struct dctcp {
-       int     bytes_ecn;      /* # of marked bytes during a RTT */
-       int     bytes_total;    /* # of acked bytes during a RTT */
-       int     alpha;          /* the fraction of marked bytes */
-       int     ce_prev;        /* CE state of the last segment */
-       int     save_sndnxt;    /* end sequence number of the current window */
-       int     ece_curr;       /* ECE flag in this segment */
-       int     ece_prev;       /* ECE flag in the last segment */
-       uint32_t    num_cong_events; /* # of congestion events */
+       uint32_t bytes_ecn;     /* # of marked bytes during a RTT */
+       uint32_t bytes_total;   /* # of acked bytes during a RTT */
+       int      alpha;         /* the fraction of marked bytes */
+       int      ce_prev;       /* CE state of the last segment */
+       tcp_seq  save_sndnxt;   /* end sequence number of the current window */
+                               // XXRMS: can't a rtt_window marker in tcpcb be 
reused?
+       int      ece_curr;      /* ECE flag in this segment */
+       int      ece_prev;      /* ECE flag in the last segment */
+       uint32_t num_cong_events; /* # of congestion events */
};

static MALLOC_DEFINE(M_dctcp, "dctcp data",
@@ -107,6 +139,8 @@ dctcp_ack_received(struct cc_var *ccv, uint16_t type)
        int bytes_acked = 0;

        dctcp_data = ccv->cc_data;

        if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
                /*
@@ -122,7 +156,7 @@ dctcp_ack_received(struct cc_var *ccv, uint16_t type)
                        newreno_cc_algo.ack_received(ccv, type);

                if (type == CC_DUPACK)
-                       bytes_acked = CCV(ccv, t_maxseg);
+                       bytes_acked = CCV(ccv, t_maxseg); //XXRMS: todo, use 
sacked data?

                if (type == CC_ACK)
                        bytes_acked = ccv->bytes_this_ack;
@@ -132,6 +166,7 @@ dctcp_ack_received(struct cc_var *ccv, uint16_t type)

                /* Update total marked bytes. */
                if (dctcp_data->ece_curr) {
+                       // Fluid-model of DCTCP for RTT fairness here (adjust 
cwnd on each ACK, rather than once per window)
                        if (!dctcp_data->ece_prev
                            && bytes_acked > CCV(ccv, t_maxseg)) {
                                dctcp_data->bytes_ecn +=
@@ -143,10 +178,13 @@ dctcp_ack_received(struct cc_var *ccv, uint16_t type)
                        if (dctcp_data->ece_prev
                            && bytes_acked > CCV(ccv, t_maxseg))
                                dctcp_data->bytes_ecn += CCV(ccv, t_maxseg);
+//                                 (bytes_acked - CCV(ccv, t_maxseg));
                        dctcp_data->ece_prev = 0;
                }
                dctcp_data->ece_curr = 0;

                /*
                 * Update the fraction of marked bytes at the end of
                 * current window size.

static void
@@ -165,18 +205,21 @@ dctcp_after_idle(struct cc_var *ccv)
{
        struct dctcp *dctcp_data;

-       dctcp_data = ccv->cc_data;
+       if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {

-       /* Initialize internal parameters after idle time */
-       dctcp_data->bytes_ecn = 0;
-       dctcp_data->bytes_total = 0;
-       dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
-       dctcp_data->alpha = V_dctcp_alpha;
-       dctcp_data->ece_curr = 0;
-       dctcp_data->ece_prev = 0;
-       dctcp_data->num_cong_events = 0;
+               dctcp_data = ccv->cc_data;

-       dctcp_cc_algo.after_idle = newreno_cc_algo.after_idle;
+               /* Initialize internal parameters after idle time */
+               dctcp_data->bytes_ecn = 0;
+               dctcp_data->bytes_total = 0;
+               dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
+               dctcp_data->alpha = V_dctcp_alpha << DCTCP_SHIFT;
+               dctcp_data->ece_curr = 0;
+               dctcp_data->ece_prev = 0;
+               dctcp_data->num_cong_events = 0;
+       }
+
+       newreno_cc_algo.after_idle(ccv);
}

static void
@@ -209,7 +252,7 @@ dctcp_cb_init(struct cc_var *ccv)
         * Note: DCTCP draft suggests initial alpha to be 1 but we've decided to
         * keep it 0 as default.
         */
-       dctcp_data->alpha = V_dctcp_alpha;
+       dctcp_data->alpha = V_dctcp_alpha << DCTCP_SHIFT;
        dctcp_data->save_sndnxt = 0;
        dctcp_data->ce_prev = 0;
        dctcp_data->ece_curr = 0;
@@ -227,63 +270,73 @@ static void
dctcp_cong_signal(struct cc_var *ccv, uint32_t type)
{
        struct dctcp *dctcp_data;
-       u_int win, mss;
+       u_int cwnd, mss;

-       dctcp_data = ccv->cc_data;
-       win = CCV(ccv, snd_cwnd);
-       mss = CCV(ccv, t_maxseg);
+       if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {

-       switch (type) {
-       case CC_NDUPACK:
-               if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+               dctcp_data = ccv->cc_data;
+               cwnd = CCV(ccv, snd_cwnd);
+               mss = CCV(ccv, t_maxseg);
+
+               //XXRMS: check if session is ECN-enabled, else do newreno
+
+               switch (type) {
+               case CC_NDUPACK:
+                       if (!IN_FASTRECOVERY(CCV(ccv, t_flags))) {
+                               if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
+                                       CCV(ccv, snd_ssthresh) =
+                                           max(cwnd / 2, 2 * mss);
+                                       dctcp_data->num_cong_events++;
+                               } else {
+                                       /* cwnd has already updated as 
congestion
+                                        * recovery. Reverse cwnd value using
+                                        * snd_cwnd_prev and recalculate 
snd_ssthresh
+                                        */
+                                       cwnd = CCV(ccv, snd_cwnd_prev);
+                                       CCV(ccv, snd_ssthresh) =
+                                           max(cwnd / 2, 2 * mss);
+                               }
+                               ENTER_RECOVERY(CCV(ccv, t_flags));
+                       }
+                       break;
+               case CC_ECN:
+                       /*
+                        * Save current snd_cwnd when the host encounters both
+                        * congestion recovery and fast recovery.
+                        */
+                       CCV(ccv, snd_cwnd_prev) = cwnd;
                        if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
-                               CCV(ccv, snd_ssthresh) = mss *
-                                   max(win / 2 / mss, 2);
+                               if (V_dctcp_slowstart &&
+                                   dctcp_data->num_cong_events++ == 0) {
+                                       CCV(ccv, snd_ssthresh) =
+                                           max(cwnd / 2, 2 * mss);
+                                       dctcp_data->alpha = MAX_ALPHA_VALUE;
+                                       dctcp_data->bytes_ecn = 0;
+                                       dctcp_data->bytes_total = 0;
+                                       dctcp_data->save_sndnxt = CCV(ccv, 
snd_nxt);
+                               } else
+                                       CCV(ccv, snd_ssthresh) =
+                                           max((cwnd - (((uint64_t)cwnd *
+                                           dctcp_data->alpha) >> 
DCTCP_SHIFT)/2),
+                                           2 * mss); //XXRMS: define magic 
number for shift
+                               CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
+                               ENTER_CONGRECOVERY(CCV(ccv, t_flags));
+                       }
+                       dctcp_data->ece_curr = 1;
+                       break;
+               case CC_RTO:
+                       if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
+                               CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
+                               dctcp_update_alpha(ccv);
+                               dctcp_data->save_sndnxt += CCV(ccv, t_maxseg);
                                dctcp_data->num_cong_events++;
-                       } else {
-                               /* cwnd has already updated as congestion
-                                * recovery. Reverse cwnd value using
-                                * snd_cwnd_prev and recalculate snd_ssthresh
-                                */
-                               win = CCV(ccv, snd_cwnd_prev);
-                               CCV(ccv, snd_ssthresh) =
-                                   max(win / 2 / mss, 2) * mss;
                        }
-                       ENTER_RECOVERY(CCV(ccv, t_flags));
-               }
-               break;
-       case CC_ECN:
-               /*
-                * Save current snd_cwnd when the host encounters both
-                * congestion recovery and fast recovery.
-                */
-               CCV(ccv, snd_cwnd_prev) = win;
-               if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) {
-                       if (V_dctcp_slowstart &&
-                           dctcp_data->num_cong_events++ == 0) {
-                               CCV(ccv, snd_ssthresh) =
-                                   mss * max(win / 2 / mss, 2);
-                               dctcp_data->alpha = MAX_ALPHA_VALUE;
-                               dctcp_data->bytes_ecn = 0;
-                               dctcp_data->bytes_total = 0;
-                               dctcp_data->save_sndnxt = CCV(ccv, snd_nxt);
-                       } else
-                               CCV(ccv, snd_ssthresh) = max((win - ((win *
-                                   dctcp_data->alpha) >> 11)) / mss, 2) * mss;
-                       CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
-                       ENTER_CONGRECOVERY(CCV(ccv, t_flags));
-               }
-               dctcp_data->ece_curr = 1;
-               break;
-       case CC_RTO:
-               if (CCV(ccv, t_flags) & TF_ECN_PERMIT) {
-                       CCV(ccv, t_flags) |= TF_ECN_SND_CWR;
-                       dctcp_update_alpha(ccv);
-                       dctcp_data->save_sndnxt += CCV(ccv, t_maxseg);
-                       dctcp_data->num_cong_events++;
+                       break;
                }
-               break;
-       }
+       } else
+               newreno_cc_algo.newreno_cong_signal(ccv, type);
}

static void
@@ -303,7 +356,7 @@ dctcp_conn_init(struct cc_var *ccv)
static void
dctcp_post_recovery(struct cc_var *ccv)
{
-       dctcp_cc_algo.post_recovery = newreno_cc_algo.post_recovery;
+       newreno_cc_algo.post_recovery(ccv);

        if (CCV(ccv, t_flags) & TF_ECN_PERMIT)
                dctcp_update_alpha(ccv);
@@ -328,9 +381,12 @@ dctcp_ecnpkt_handler(struct cc_var *ccv)
        ccflag = ccv->flags;
        delay_ack = 1;

        /*
-        * DCTCP responses an ACK immediately when the CE state
-        * in between this segment and the last segment is not same.
+        * DCTCP responds with an ACK immediately when the CE state
+        * in between this segment and the last segment has changed.
         */
        if (ccflag & CCF_IPHDR_CE) {
                if (!dctcp_data->ce_prev && (ccflag & CCF_DELACK))
@@ -350,8 +406,11 @@ dctcp_ecnpkt_handler(struct cc_var *ccv)

        if (delay_ack == 0)
                ccv->flags |= CCF_ACKNOW;
-       else
-               ccv->flags &= ~CCF_ACKNOW;
+//     else
+//             ccv->flags &= ~CCF_ACKNOW; //XXRMS: shouldn't we leave this 
alone here?
}

/*
@@ -366,7 +425,8 @@ dctcp_update_alpha(struct cc_var *ccv)

        dctcp_data = ccv->cc_data;
        alpha_prev = dctcp_data->alpha;
-       dctcp_data->bytes_total = max(dctcp_data->bytes_total, 1);
+       if (dctcp_data->bytes_total < 1)
+               dctcp_data->bytes_total = 1;

        /*
         * Update alpha: alpha = (1 - g) * alpha + g * F.
@@ -379,8 +439,8 @@ dctcp_update_alpha(struct cc_var *ccv)
         *      updated every RTT
         * Alpha must be round to 0 - MAX_ALPHA_VALUE.
         */
-       dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
-           (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) /
+       dctcp_data->alpha = ulmin(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
+           ((uint64_t)dctcp_data->bytes_ecn << (DCTCP_SHIFT - 
V_dctcp_shift_g)) /
            dctcp_data->bytes_total, MAX_ALPHA_VALUE);

        /* Initialize internal parameters for next alpha calculation */
@@ -398,14 +458,10 @@ dctcp_alpha_handler(SYSCTL_HANDLER_ARGS)
        new = V_dctcp_alpha;
        error = sysctl_handle_int(oidp, &new, 0, req);
        if (error == 0 && req->newptr != NULL) {
-               if (new > 1)
+               if (new > 1) //XXRMS: this only effectively allows dctcp_alpha 
to be set to 0 or 1?
                        error = EINVAL;
-               else {
-                       if (new > MAX_ALPHA_VALUE)
-                               V_dctcp_alpha = MAX_ALPHA_VALUE;
-                       else
-                               V_dctcp_alpha = new;
-               }
+               else
+                       V_dctcp_alpha = new;
        }

        return (error);
@@ -420,10 +476,15 @@ dctcp_shift_g_handler(SYSCTL_HANDLER_ARGS)
        new = V_dctcp_shift_g;
        error = sysctl_handle_int(oidp, &new, 0, req);
        if (error == 0 && req->newptr != NULL) {
-               if (new > 1)
+               if (new < 0) //XXRMS: ditto
                        error = EINVAL;
-               else
-                       V_dctcp_shift_g = new;
+               else {
+                       if (new > DCTCP_SHIFT) {
+                               V_dctcp_shift_g = DCTCP_SHIFT;
+                               error = EINVAL;
+                       } else
+                               V_dctcp_shift_g = new;
+               }
        }

        return (error);
@@ -438,7 +499,7 @@ dctcp_slowstart_handler(SYSCTL_HANDLER_ARGS)
        new = V_dctcp_slowstart;
        error = sysctl_handle_int(oidp, &new, 0, req);
        if (error == 0 && req->newptr != NULL) {
-               if (new > 1)
+               if ((new > 1) || (new < 0))
                        error = EINVAL;
                else
                        V_dctcp_slowstart = new;
@@ -454,7 +515,7 @@ SYSCTL_NODE(_net_inet_tcp_cc, OID_AUTO, dctcp, CTLFLAG_RW, 
NULL,
SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, alpha,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_alpha), 0,
     &dctcp_alpha_handler,
-    "IU", "dctcp alpha parameter");
+    "IU", "dctcp alpha parameter at start of session");

SYSCTL_PROC(_net_inet_tcp_cc_dctcp, OID_AUTO, shift_g,
     CTLFLAG_VNET|CTLTYPE_UINT|CTLFLAG_RW, &VNET_NAME(dctcp_shift_g), 4,


Richard Scheffenegger


Begin forwarded message:

From: Yu He via freebsd-net 
<freebsd-net@freebsd.org<mailto:freebsd-net@freebsd.org>>
Subject: Some question about DCTCP implementation in FreeBSD
Date: June 4, 2019 at 11:05:34 PDT
To: "freebsd-net@freebsd.org<mailto:freebsd-net@freebsd.org>" 
<freebsd-net@freebsd.org<mailto:freebsd-net@freebsd.org>>
Cc: Gopakumar Pillai <gpil...@vmware.com<mailto:gpil...@vmware.com>>
Reply-To: Yu He <y...@vmware.com<mailto:y...@vmware.com>>

Greetings!

I’m a graduate student from Carnegie Mellon University, and currently an intern 
in VMware. I’m now working on some internal project about implementing DCTCP 
algorithm and coming across the implementation in 
https://reviews.freebsd.org/rS277054. There is one thing I’m quite curious 
about.

In line 387 of file cc_tcp.c, the update of alpha is calculated by following 
code:

    dctcp_data->alpha = min(alpha_prev - (alpha_prev >> V_dctcp_shift_g) +
        (dctcp_data->bytes_ecn << (10 - V_dctcp_shift_g)) /
        dctcp_data->bytes_total, MAX_ALPHA_VALUE);

As the update formula from the original paper is alpha = (1 - g) * alpha + g * 
F, I’m wandering about what the intention is of using left-shift when 
calculating the g * F part, which might seemingly multiplying the value rather 
than dividing it as suggested by the previous code. Let alone the operand (10 - 
  V_dctcp_shift_g) might be a negative value, which will lead to an undefined 
behavior in C.

Because this is the very central formula of DCTCP algorithm, I’m feeling it 
necessary to have an explanation from the original authors.

Or if in another way this is actually a bug, I’m willing to improve it.

Best,

Yu He
Intern-Product Development-NSBU,
VMware

Master of Science, Information Networking,
Carnegie Mellon University



_______________________________________________
freebsd-net@freebsd.org<mailto:freebsd-net@freebsd.org> mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to 
"freebsd-net-unsubscr...@freebsd.org<mailto:freebsd-net-unsubscr...@freebsd.org>"

_______________________________________________
freebsd-net@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-net
To unsubscribe, send any mail to "freebsd-net-unsubscr...@freebsd.org"

Reply via email to