Hi tech@,

I started implementing CoDel[1] in altq, as an alternative to RED. It's based on
the Linux implementation, which is dual-licensed GPL/BSD. I'm looking for early
feedback (design, big issues, etc.). Work-in-progress diff is below.

The big issues I see are:

- CoDel needs to timestamp each packet when it gets queued.
  - I added a new member in struct pkthdr for this. Is this ok?
  - I end up calling microuptime() for each packet. Is this ok?

- CoDel drops packets at dequeue time, contrary to RED. This means that for ECN
  we need to have the header available, but it is long gone. Altq makes the
  header available to red_enqueue() then forgets about it. I'll figure out a way
  to do this I'm sure (either copy or recompute), but suggestions are
  appreciated.

Any other suggestions/comments/questions very much appreciated.

Thanks,
Simon

[1] http://www.bufferbloat.net/projects/codel/wiki



THIS DIFF DOESN'T WORK. I NEVER TESTED IT. IT'S A WORK IN PROGRESS.

diff --git a/sys/altq/altq_classq.h b/sys/altq/altq_classq.h
index a9ac674..43863b6 100644
--- a/sys/altq/altq_classq.h
+++ b/sys/altq/altq_classq.h
@@ -44,10 +44,11 @@ extern "C" {
 #endif
 
 /*
- * Packet Queue types: RED or DROPHEAD.
+ * Packet Queue types
  */
 #define        Q_DROPHEAD      0x00
 #define        Q_RED           0x01
+#define        Q_CODEL         0x02
 #define        Q_DROPTAIL      0x03
 
 #ifdef _KERNEL
@@ -58,6 +59,7 @@ extern "C" {
 struct _class_queue_ {
        struct mbuf     *tail_; /* Tail of packet queue */
        int     qlen_;          /* Queue length (in number of packets) */
+       int     qsize_;         /* Queue size (in bytes) */
        int     qlim_;          /* Queue limit (in number of packets*) */
        int     qtype_;         /* Queue type */
 };
@@ -67,11 +69,13 @@ typedef struct _class_queue_        class_queue_t;
 #define        qtype(q)        (q)->qtype_             /* Get queue type */
 #define        qlimit(q)       (q)->qlim_              /* Max packets to be 
queued */
 #define        qlen(q)         (q)->qlen_              /* Current queue 
length. */
+#define        qsize(q)        (q)->qsize_             /* Current queue size. 
*/
 #define        qtail(q)        (q)->tail_              /* Tail of the queue */
 #define        qhead(q)        ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL)
 
 #define        qempty(q)       ((q)->qlen_ == 0)       /* Is the queue empty?? 
*/
 #define        q_is_red(q)     ((q)->qtype_ == Q_RED)  /* Is the queue a red 
queue */
+#define        q_is_codel(q)   ((q)->qtype_ == Q_CODEL)/* Is the queue a CoDel 
queue */
 
 #if !defined(__GNUC__) || defined(ALTQ_DEBUG)
 
@@ -98,6 +102,7 @@ _addq(class_queue_t *q, struct mbuf *m)
        m0->m_nextpkt = m;
        qtail(q) = m;
        qlen(q)++;
+       qsize(q) += m_pktlen(m);
 }
 
 static __inline struct mbuf *
@@ -112,6 +117,7 @@ _getq(class_queue_t *q)
        else
                qtail(q) = NULL;
        qlen(q)--;
+       qsize(q) -= m_pktlen(m0);
        m0->m_nextpkt = NULL;
        return (m0);
 }
diff --git a/sys/altq/altq_codel.c b/sys/altq/altq_codel.c
new file mode 100644
index 0000000..7a6c186
--- /dev/null
+++ b/sys/altq/altq_codel.c
@@ -0,0 +1,241 @@
+#include <sys/param.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/socket.h>
+#include <sys/systm.h>
+
+#include <net/if.h>
+
+#include <altq/altq.h>
+#include <altq/altq_codel.h>
+
+/**
+ * struct codel_params - contains codel parameters
+ * @target:    target queue size (in time units)
+ * @interval:  width of moving time window
+ * @ecn:       is Explicit Congestion Notification enabled
+ */
+struct codel_params {
+       u_int64_t       target;
+       u_int64_t       interval;
+       int             ecn;
+};
+
+/**
+ * struct codel_vars - contains codel variables
+ * @count:             how many drops we've done since the last time we
+ *                     entered dropping state
+ * @lastcount:         count at entry to dropping state
+ * @dropping:          set to true if in dropping state
+ * @rec_inv_sqrt:      reciprocal value of sqrt(count) >> 1
+ * @first_above_time:  when we went (or will go) continuously above target
+ *                     for interval
+ * @drop_next:         time to drop next packet, or when we dropped last
+ * @ldelay:            sojourn time of last dequeued packet
+ */
+struct codel_vars {
+       u_int32_t       count;
+       u_int32_t       lastcount;
+       int             dropping;
+       u_int16_t       rec_inv_sqrt;
+       u_int64_t       first_above_time;
+       u_int64_t       drop_next;
+       u_int64_t       ldelay;
+};
+
+struct codel {
+       struct codel_params      params;
+       struct codel_vars        vars;
+       struct codel_stats       stats;
+       u_int32_t                drop_overlimit;
+};
+
+static int              codel_should_drop(struct codel *, class_queue_t *,
+                           struct mbuf *, u_int64_t);
+static void             codel_Newton_step(struct codel_vars *);
+static u_int64_t        codel_control_law(u_int64_t t, u_int64_t, u_int32_t);
+
+#define codel_time_after(a, b)         ((int64_t)(a) - (int64_t)(b) > 0)
+#define codel_time_after_eq(a, b)      ((int64_t)(a) - (int64_t)(b) >= 0)
+#define codel_time_before(a, b)                ((int64_t)(a) - (int64_t)(b) < 
0)
+#define codel_time_before_eq(a, b)     ((int64_t)(a) - (int64_t)(b) <= 0)
+
+struct codel *
+codel_alloc(int target, int interval, int ecn)
+{
+       struct codel    *c;
+
+       c = malloc(sizeof(*c), M_DEVBUF, M_WAITOK|M_ZERO);
+
+       c->params.target = machclk_freq * target / 1000;
+       c->params.interval = machclk_freq * interval / 1000;
+       c->params.ecn = ecn;
+       c->stats.maxpacket = 256;
+
+       return (c);
+}
+
+void
+codel_destroy(struct codel *c)
+{
+       free(c, M_DEVBUF);
+}
+
+int
+codel_addq(struct codel *c, class_queue_t *q, struct mbuf *m)
+{
+       if (qlen(q) < qlimit(q)) {
+               m->m_pkthdr.enqueue_time = read_machclk();
+               _addq(q, m);
+               return (0);
+       }
+       c->drop_overlimit++;
+       m_freem(m);
+       return (-1);
+}
+
+static int
+codel_should_drop(struct codel *c, class_queue_t *q, struct mbuf *m,
+    u_int64_t now)
+{
+       if (m == NULL) {
+               c->vars.first_above_time = 0;
+               return (0);
+       }
+
+       c->vars.ldelay = now - m->m_pkthdr.enqueue_time;
+       c->stats.maxpacket = MAX(c->stats.maxpacket, m_pktlen(m));
+
+       if (codel_time_before(c->vars.ldelay, c->params.target) ||
+           qsize(q) <= c->stats.maxpacket) {
+               /* went below - stay below for at least interval */
+               c->vars.first_above_time = 0;
+               return (0);
+       }
+       if (c->vars.first_above_time == 0) {
+               /* just went above from below. If we stay above
+                * for at least interval we'll say it's ok to drop
+                */
+               c->vars.first_above_time = now + c->params.interval;
+               return (0);
+       }
+       if (codel_time_after(now, c->vars.first_above_time))
+               return (1);
+       return (0);
+}
+
+/*
+ * 
http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+static void codel_Newton_step(struct codel_vars *vars)
+{
+#define REC_INV_SQRT_BITS (8 * sizeof(u_int16_t)) /* or 
sizeof_in_bits(rec_inv_sqrt) */
+/* needed shift to get a Q0.32 number from rec_inv_sqrt */
+#define REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS)
+
+       u_int32_t invsqrt = ((u_int32_t)vars->rec_inv_sqrt) << 
REC_INV_SQRT_SHIFT;
+       u_int32_t invsqrt2 = ((u_int64_t)invsqrt * invsqrt) >> 32;
+       u_int64_t val = (3LL << 32) - ((u_int64_t)vars->count * invsqrt2);
+
+       val >>= 2; /* avoid overflow in following multiply */
+       val = (val * invsqrt) >> (32 - 2 + 1);
+
+       vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
+}
+
+static u_int64_t
+codel_control_law(u_int64_t t, u_int64_t interval, u_int32_t rec_inv_sqrt)
+{
+       return (t + (u_int32_t)(((u_int64_t)interval *
+           (rec_inv_sqrt << REC_INV_SQRT_SHIFT)) >> 32));
+}
+
+struct mbuf *
+codel_getq(struct codel *c, class_queue_t *q)
+{
+       struct mbuf     *m;
+       u_int64_t        now;
+       int              drop;
+
+       if ((m = _getq(q)) == NULL) {
+               c->vars.dropping = 0;
+               return (m);
+       }
+
+       now = read_machclk();
+       drop = codel_should_drop(c, q, m, now);
+       if (c->vars.dropping) {
+               if (!drop) {
+                       /* sojourn time below target - leave dropping state */
+                       c->vars.dropping = 0;
+               } else if (codel_time_after_eq(now, c->vars.drop_next)) {
+                       /* It's time for the next drop. Drop the current
+                        * packet and dequeue the next. The dequeue might
+                        * take us out of dropping state.
+                        * If not, schedule the next drop.
+                        * A large backlog might result in drop rates so high
+                        * that the next drop should happen now,
+                        * hence the while loop.
+                        */
+                       while (c->vars.dropping &&
+                           codel_time_after_eq(now, c->vars.drop_next)) {
+                               c->vars.count++; /* don't care of possible wrap
+                                                 * since there is no more
+                                                 * divide */
+                               codel_Newton_step(&c->vars);
+                               /* TODO ECN */
+                               PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
+                               m_freem(m);
+                               m = _getq(q);
+                               if (!codel_should_drop(c, q, m, now))
+                                       /* leave dropping state */
+                                       c->vars.dropping = 0;
+                               else
+                                       /* and schedule the next drop */
+                                       c->vars.drop_next =
+                                           codel_control_law(c->vars.drop_next,
+                                               c->params.interval,
+                                               c->vars.rec_inv_sqrt);
+                       }
+               }
+       } else if (drop) {
+               /* TODO ECN */
+               PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
+               m_freem(m);
+
+               m = _getq(q);
+               drop = codel_should_drop(c, q, m, now);
+
+               c->vars.dropping = 1;
+               /* if min went above target close to when we last went below it
+                * assume that the drop rate that controlled the queue on the
+                * last cycle is a good starting point to control it now.
+                */
+               if (codel_time_before(now - c->vars.drop_next,
+                   16 * c->params.interval)) {
+                       c->vars.count = (c->vars.count - c->vars.lastcount) | 1;
+                       /* we dont care if rec_inv_sqrt approximation
+                        * is not very precise :
+                        * Next Newton steps will correct it quadratically.
+                        */
+                       codel_Newton_step(&c->vars);
+               } else {
+                       c->vars.count = 1;
+                       c->vars.rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
+               }
+               c->vars.lastcount = c->vars.count;
+               c->vars.drop_next = codel_control_law(now, c->params.interval,
+                   c->vars.rec_inv_sqrt);
+       }
+
+       return m;
+}
+
+void
+codel_getstats(struct codel *c, struct codel_stats *s)
+{
+       *s = c->stats;
+}
diff --git a/sys/altq/altq_codel.h b/sys/altq/altq_codel.h
new file mode 100644
index 0000000..4daae2d
--- /dev/null
+++ b/sys/altq/altq_codel.h
@@ -0,0 +1,20 @@
+#ifndef _ALTQ_ALTQ_CODEL_H_
+#define _ALTQ_ALTQ_CODEL_H_
+
+#include <altq/altq_classq.h>
+
+struct codel;
+
+struct codel_stats {
+       u_int32_t        maxpacket;
+       struct pktcntr   drop_cnt;
+       u_int            marked_packets;
+};
+
+struct codel   *codel_alloc(int, int, int);
+void            codel_destroy(struct codel *);
+int             codel_addq(struct codel *, class_queue_t *, struct mbuf *);
+struct mbuf    *codel_getq(struct codel *, class_queue_t *);
+void            codel_getstats(struct codel *, struct codel_stats *);
+
+#endif /* _ALTQ_ALTQ_RED_H_ */
diff --git a/sys/altq/altq_priq.c b/sys/altq/altq_priq.c
index 9d3eb5d..5c04c48 100644
--- a/sys/altq/altq_priq.c
+++ b/sys/altq/altq_priq.c
@@ -247,6 +247,15 @@ priq_class_create(struct priq_if *pif, int pri, int 
qlimit, int flags, int qid)
        }
 #endif
 
+#ifndef ALTQ_CODEL
+       if (flags & PRCF_CODEL) {
+#ifdef ALTQ_DEBUG
+               printf("priq_class_create: CODEL not configured for PRIQ!\n");
+#endif
+               return (NULL);
+       }
+#endif
+
        if ((cl = pif->pif_classes[pri]) != NULL) {
                /* modify the class instead of creating a new one */
                s = splnet();
@@ -255,7 +264,11 @@ priq_class_create(struct priq_if *pif, int pri, int 
qlimit, int flags, int qid)
                splx(s);
 #ifdef ALTQ_RED
                if (q_is_red(cl->cl_q))
-                       red_destroy(cl->cl_red);
+                       red_destroy(cl->cl_aqm.red);
+#endif
+#ifdef ALTQ_CODEL
+               if (q_is_codel(cl->cl_q))
+                       codel_destroy(cl->cl_aqm.codel);
 #endif
        } else {
                cl = malloc(sizeof(struct priq_class), M_DEVBUF,
@@ -293,7 +306,7 @@ priq_class_create(struct priq_if *pif, int pri, int qlimit, 
int flags, int qid)
                        red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu
                          * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8);
                if (flags & PRCF_RED) {
-                       cl->cl_red = red_alloc(0, 0,
+                       cl->cl_aqm.red = red_alloc(0, 0,
                            qlimit(cl->cl_q) * 10/100,
                            qlimit(cl->cl_q) * 30/100,
                            red_flags, red_pkttime);
@@ -301,6 +314,12 @@ priq_class_create(struct priq_if *pif, int pri, int 
qlimit, int flags, int qid)
                }
        }
 #endif /* ALTQ_RED */
+#ifdef ALTQ_CODEL
+       if (flags & PRCF_CODEL) {
+               cl->cl_aqm.codel = codel_alloc(100, 5, 0);
+               qtype(cl->cl_q) = Q_CODEL;
+       }
+#endif /* ALTQ_CODEL */
 
        return (cl);
 }
@@ -329,10 +348,14 @@ priq_class_destroy(struct priq_class *cl)
        }
        splx(s);
 
-       if (cl->cl_red != NULL) {
+       if (cl->cl_aqm.red != NULL) {
 #ifdef ALTQ_RED
                if (q_is_red(cl->cl_q))
-                       red_destroy(cl->cl_red);
+                       red_destroy(cl->cl_aqm.red);
+#endif
+#ifdef ALTQ_CODEL
+               if (q_is_codel(cl->cl_q))
+                       codel_destroy(cl->cl_aqm.codel);
 #endif
        }
        free(cl->cl_q, M_DEVBUF);
@@ -426,7 +449,11 @@ priq_addq(struct priq_class *cl, struct mbuf *m)
 
 #ifdef ALTQ_RED
        if (q_is_red(cl->cl_q))
-               return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr);
+               return red_addq(cl->cl_aqm.red, cl->cl_q, m, cl->cl_pktattr);
+#endif
+#ifdef ALTQ_CODEL
+       if (q_is_codel(cl->cl_q))
+               return codel_addq(cl->cl_aqm.codel, cl->cl_q, m);
 #endif
        if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) {
                m_freem(m);
@@ -443,7 +470,11 @@ priq_getq(struct priq_class *cl)
 {
 #ifdef ALTQ_RED
        if (q_is_red(cl->cl_q))
-               return red_getq(cl->cl_red, cl->cl_q);
+               return red_getq(cl->cl_aqm.red, cl->cl_q);
+#endif
+#ifdef ALTQ_CODEL
+       if (q_is_codel(cl->cl_q))
+               return codel_getq(cl->cl_aqm.codel, cl->cl_q);
 #endif
        return _getq(cl->cl_q);
 }
@@ -483,7 +514,11 @@ get_class_stats(struct priq_classstats *sp, struct 
priq_class *cl)
        sp->qtype = qtype(cl->cl_q);
 #ifdef ALTQ_RED
        if (q_is_red(cl->cl_q))
-               red_getstats(cl->cl_red, &sp->red[0]);
+               red_getstats(cl->cl_aqm.red, &sp->red[0]);
+#endif
+#ifdef ALTQ_CODEL
+       if (q_is_codel(cl->cl_q))
+               codel_getstats(cl->cl_aqm.codel, &sp->codel);
 #endif
 }
 
diff --git a/sys/altq/altq_priq.h b/sys/altq/altq_priq.h
index b4bbf3a..7081425 100644
--- a/sys/altq/altq_priq.h
+++ b/sys/altq/altq_priq.h
@@ -32,6 +32,7 @@
 #include <altq/altq.h>
 #include <altq/altq_classq.h>
 #include <altq/altq_red.h>
+#include <altq/altq_codel.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -41,8 +42,9 @@ extern "C" {
 
 /* priq class flags */
 #define        PRCF_RED                0x0001  /* use RED */
-#define        PRCF_ECN                0x0002  /* use RED/ECN */
+#define        PRCF_ECN                0x0002  /* use ECN */
 #define        PRCF_RIO                0x0004  /* use RIO */
+#define PRCF_CODEL             0x0008  /* use CoDel */
 #define        PRCF_DEFAULTCLASS       0x1000  /* default class */
 
 /* special class handles */
@@ -60,6 +62,7 @@ struct priq_classstats {
        /* red and rio related info */
        int                     qtype;
        struct redstats         red[3]; /* rio has 3 red stats */
+       struct codel_stats      codel;
 };
 
 #ifdef _KERNEL
@@ -67,7 +70,10 @@ struct priq_classstats {
 struct priq_class {
        u_int32_t       cl_handle;      /* class handle */
        class_queue_t   *cl_q;          /* class queue structure */
-       struct red      *cl_red;        /* RED state */
+       union {
+               struct red      *red;   /* RED state */
+               struct codel    *codel; /* CoDel state */
+       } cl_aqm;
        int             cl_pri;         /* priority */
        int             cl_flags;       /* class flags */
        struct priq_if  *cl_pif;        /* back pointer to pif */
diff --git a/sys/altq/altq_var.h b/sys/altq/altq_var.h
index bda6527..9436a41 100644
--- a/sys/altq/altq_var.h
+++ b/sys/altq/altq_var.h
@@ -39,6 +39,9 @@
 #ifndef ALTQ_RED
 #define ALTQ_RED               /* RED is enabled by default */
 #endif
+#ifndef ALTQ_CODEL
+#define ALTQ_CODEL             /* CODEL is enabled by default */
+#endif
 #ifndef ALTQ_CBQ
 #define ALTQ_CBQ               /* CBQ is enabled by default */
 #endif
diff --git a/sys/conf/files b/sys/conf/files
index 3878a69..5f2f360 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -567,6 +567,7 @@ file        net/if_pppoe.c                  pppoe   
needs-flag
 # kernel sources
 file altq/altq_subr.c                  altq
 file altq/altq_red.c                   altq
+file altq/altq_codel.c                 altq
 file altq/altq_cbq.c                   altq
 file altq/altq_rmclass.c               altq
 file altq/altq_hfsc.c                  altq
diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h
index 7e90db7..195dfe1 100644
--- a/sys/sys/mbuf.h
+++ b/sys/sys/mbuf.h
@@ -104,6 +104,7 @@ struct      pkthdr {
        u_int16_t                csum_flags;    /* checksum flags */
        u_int16_t                ether_vtag;    /* Ethernet 802.1p+Q vlan tag */
        u_int                    rdomain;       /* routing domain id */
+       u_int64_t                enqueue_time;  /* enqueue time, for CoDel */
        struct pkthdr_pf         pf;
 };

Reply via email to