Hi tech@, I started implementing CoDel[1] in altq, as an alternative to RED. It's based on the Linux implementation, which is dual-licensed GPL/BSD. I'm looking for early feedback (design, big issues, etc.). Work-in-progress diff is below.
The big issues I see are: - CoDel needs to timestamp each packet when it gets queued. - I added a new member in struct pkthdr for this. Is this ok? - I end up calling microuptime() for each packet. Is this ok? - CoDel drops packets at dequeue time, contrary to RED. This means that for ECN we need to have the header available, but it is long gone. Altq makes the header available to red_enqueue() then forgets about it. I'll figure out a way to do this I'm sure (either copy or recompute), but suggestions are appreciated. Any other suggestions/comments/questions very much appreciated. Thanks, Simon [1] http://www.bufferbloat.net/projects/codel/wiki THIS DIFF DOESN'T WORK. I NEVER TESTED IT. IT'S A WORK IN PROGRESS. diff --git a/sys/altq/altq_classq.h b/sys/altq/altq_classq.h index a9ac674..43863b6 100644 --- a/sys/altq/altq_classq.h +++ b/sys/altq/altq_classq.h @@ -44,10 +44,11 @@ extern "C" { #endif /* - * Packet Queue types: RED or DROPHEAD. + * Packet Queue types */ #define Q_DROPHEAD 0x00 #define Q_RED 0x01 +#define Q_CODEL 0x02 #define Q_DROPTAIL 0x03 #ifdef _KERNEL @@ -58,6 +59,7 @@ extern "C" { struct _class_queue_ { struct mbuf *tail_; /* Tail of packet queue */ int qlen_; /* Queue length (in number of packets) */ + int qsize_; /* Queue size (in bytes) */ int qlim_; /* Queue limit (in number of packets*) */ int qtype_; /* Queue type */ }; @@ -67,11 +69,13 @@ typedef struct _class_queue_ class_queue_t; #define qtype(q) (q)->qtype_ /* Get queue type */ #define qlimit(q) (q)->qlim_ /* Max packets to be queued */ #define qlen(q) (q)->qlen_ /* Current queue length. */ +#define qsize(q) (q)->qsize_ /* Current queue size. */ #define qtail(q) (q)->tail_ /* Tail of the queue */ #define qhead(q) ((q)->tail_ ? (q)->tail_->m_nextpkt : NULL) #define qempty(q) ((q)->qlen_ == 0) /* Is the queue empty?? */ #define q_is_red(q) ((q)->qtype_ == Q_RED) /* Is the queue a red queue */ +#define q_is_codel(q) ((q)->qtype_ == Q_CODEL)/* Is the queue a CoDel queue */ #if !defined(__GNUC__) || defined(ALTQ_DEBUG) @@ -98,6 +102,7 @@ _addq(class_queue_t *q, struct mbuf *m) m0->m_nextpkt = m; qtail(q) = m; qlen(q)++; + qsize(q) += m_pktlen(m); } static __inline struct mbuf * @@ -112,6 +117,7 @@ _getq(class_queue_t *q) else qtail(q) = NULL; qlen(q)--; + qsize(q) -= m_pktlen(m0); m0->m_nextpkt = NULL; return (m0); } diff --git a/sys/altq/altq_codel.c b/sys/altq/altq_codel.c new file mode 100644 index 0000000..7a6c186 --- /dev/null +++ b/sys/altq/altq_codel.c @@ -0,0 +1,241 @@ +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/socket.h> +#include <sys/systm.h> + +#include <net/if.h> + +#include <altq/altq.h> +#include <altq/altq_codel.h> + +/** + * struct codel_params - contains codel parameters + * @target: target queue size (in time units) + * @interval: width of moving time window + * @ecn: is Explicit Congestion Notification enabled + */ +struct codel_params { + u_int64_t target; + u_int64_t interval; + int ecn; +}; + +/** + * struct codel_vars - contains codel variables + * @count: how many drops we've done since the last time we + * entered dropping state + * @lastcount: count at entry to dropping state + * @dropping: set to true if in dropping state + * @rec_inv_sqrt: reciprocal value of sqrt(count) >> 1 + * @first_above_time: when we went (or will go) continuously above target + * for interval + * @drop_next: time to drop next packet, or when we dropped last + * @ldelay: sojourn time of last dequeued packet + */ +struct codel_vars { + u_int32_t count; + u_int32_t lastcount; + int dropping; + u_int16_t rec_inv_sqrt; + u_int64_t first_above_time; + u_int64_t drop_next; + u_int64_t ldelay; +}; + +struct codel { + struct codel_params params; + struct codel_vars vars; + struct codel_stats stats; + u_int32_t drop_overlimit; +}; + +static int codel_should_drop(struct codel *, class_queue_t *, + struct mbuf *, u_int64_t); +static void codel_Newton_step(struct codel_vars *); +static u_int64_t codel_control_law(u_int64_t t, u_int64_t, u_int32_t); + +#define codel_time_after(a, b) ((int64_t)(a) - (int64_t)(b) > 0) +#define codel_time_after_eq(a, b) ((int64_t)(a) - (int64_t)(b) >= 0) +#define codel_time_before(a, b) ((int64_t)(a) - (int64_t)(b) < 0) +#define codel_time_before_eq(a, b) ((int64_t)(a) - (int64_t)(b) <= 0) + +struct codel * +codel_alloc(int target, int interval, int ecn) +{ + struct codel *c; + + c = malloc(sizeof(*c), M_DEVBUF, M_WAITOK|M_ZERO); + + c->params.target = machclk_freq * target / 1000; + c->params.interval = machclk_freq * interval / 1000; + c->params.ecn = ecn; + c->stats.maxpacket = 256; + + return (c); +} + +void +codel_destroy(struct codel *c) +{ + free(c, M_DEVBUF); +} + +int +codel_addq(struct codel *c, class_queue_t *q, struct mbuf *m) +{ + if (qlen(q) < qlimit(q)) { + m->m_pkthdr.enqueue_time = read_machclk(); + _addq(q, m); + return (0); + } + c->drop_overlimit++; + m_freem(m); + return (-1); +} + +static int +codel_should_drop(struct codel *c, class_queue_t *q, struct mbuf *m, + u_int64_t now) +{ + if (m == NULL) { + c->vars.first_above_time = 0; + return (0); + } + + c->vars.ldelay = now - m->m_pkthdr.enqueue_time; + c->stats.maxpacket = MAX(c->stats.maxpacket, m_pktlen(m)); + + if (codel_time_before(c->vars.ldelay, c->params.target) || + qsize(q) <= c->stats.maxpacket) { + /* went below - stay below for at least interval */ + c->vars.first_above_time = 0; + return (0); + } + if (c->vars.first_above_time == 0) { + /* just went above from below. If we stay above + * for at least interval we'll say it's ok to drop + */ + c->vars.first_above_time = now + c->params.interval; + return (0); + } + if (codel_time_after(now, c->vars.first_above_time)) + return (1); + return (0); +} + +/* + * http://en.wikipedia.org/wiki/Methods_of_computing_square_roots#Iterative_methods_for_reciprocal_square_roots + * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) + * + * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32 + */ +static void codel_Newton_step(struct codel_vars *vars) +{ +#define REC_INV_SQRT_BITS (8 * sizeof(u_int16_t)) /* or sizeof_in_bits(rec_inv_sqrt) */ +/* needed shift to get a Q0.32 number from rec_inv_sqrt */ +#define REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS) + + u_int32_t invsqrt = ((u_int32_t)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT; + u_int32_t invsqrt2 = ((u_int64_t)invsqrt * invsqrt) >> 32; + u_int64_t val = (3LL << 32) - ((u_int64_t)vars->count * invsqrt2); + + val >>= 2; /* avoid overflow in following multiply */ + val = (val * invsqrt) >> (32 - 2 + 1); + + vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT; +} + +static u_int64_t +codel_control_law(u_int64_t t, u_int64_t interval, u_int32_t rec_inv_sqrt) +{ + return (t + (u_int32_t)(((u_int64_t)interval * + (rec_inv_sqrt << REC_INV_SQRT_SHIFT)) >> 32)); +} + +struct mbuf * +codel_getq(struct codel *c, class_queue_t *q) +{ + struct mbuf *m; + u_int64_t now; + int drop; + + if ((m = _getq(q)) == NULL) { + c->vars.dropping = 0; + return (m); + } + + now = read_machclk(); + drop = codel_should_drop(c, q, m, now); + if (c->vars.dropping) { + if (!drop) { + /* sojourn time below target - leave dropping state */ + c->vars.dropping = 0; + } else if (codel_time_after_eq(now, c->vars.drop_next)) { + /* It's time for the next drop. Drop the current + * packet and dequeue the next. The dequeue might + * take us out of dropping state. + * If not, schedule the next drop. + * A large backlog might result in drop rates so high + * that the next drop should happen now, + * hence the while loop. + */ + while (c->vars.dropping && + codel_time_after_eq(now, c->vars.drop_next)) { + c->vars.count++; /* don't care of possible wrap + * since there is no more + * divide */ + codel_Newton_step(&c->vars); + /* TODO ECN */ + PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m)); + m_freem(m); + m = _getq(q); + if (!codel_should_drop(c, q, m, now)) + /* leave dropping state */ + c->vars.dropping = 0; + else + /* and schedule the next drop */ + c->vars.drop_next = + codel_control_law(c->vars.drop_next, + c->params.interval, + c->vars.rec_inv_sqrt); + } + } + } else if (drop) { + /* TODO ECN */ + PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m)); + m_freem(m); + + m = _getq(q); + drop = codel_should_drop(c, q, m, now); + + c->vars.dropping = 1; + /* if min went above target close to when we last went below it + * assume that the drop rate that controlled the queue on the + * last cycle is a good starting point to control it now. + */ + if (codel_time_before(now - c->vars.drop_next, + 16 * c->params.interval)) { + c->vars.count = (c->vars.count - c->vars.lastcount) | 1; + /* we dont care if rec_inv_sqrt approximation + * is not very precise : + * Next Newton steps will correct it quadratically. + */ + codel_Newton_step(&c->vars); + } else { + c->vars.count = 1; + c->vars.rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT; + } + c->vars.lastcount = c->vars.count; + c->vars.drop_next = codel_control_law(now, c->params.interval, + c->vars.rec_inv_sqrt); + } + + return m; +} + +void +codel_getstats(struct codel *c, struct codel_stats *s) +{ + *s = c->stats; +} diff --git a/sys/altq/altq_codel.h b/sys/altq/altq_codel.h new file mode 100644 index 0000000..4daae2d --- /dev/null +++ b/sys/altq/altq_codel.h @@ -0,0 +1,20 @@ +#ifndef _ALTQ_ALTQ_CODEL_H_ +#define _ALTQ_ALTQ_CODEL_H_ + +#include <altq/altq_classq.h> + +struct codel; + +struct codel_stats { + u_int32_t maxpacket; + struct pktcntr drop_cnt; + u_int marked_packets; +}; + +struct codel *codel_alloc(int, int, int); +void codel_destroy(struct codel *); +int codel_addq(struct codel *, class_queue_t *, struct mbuf *); +struct mbuf *codel_getq(struct codel *, class_queue_t *); +void codel_getstats(struct codel *, struct codel_stats *); + +#endif /* _ALTQ_ALTQ_RED_H_ */ diff --git a/sys/altq/altq_priq.c b/sys/altq/altq_priq.c index 9d3eb5d..5c04c48 100644 --- a/sys/altq/altq_priq.c +++ b/sys/altq/altq_priq.c @@ -247,6 +247,15 @@ priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) } #endif +#ifndef ALTQ_CODEL + if (flags & PRCF_CODEL) { +#ifdef ALTQ_DEBUG + printf("priq_class_create: CODEL not configured for PRIQ!\n"); +#endif + return (NULL); + } +#endif + if ((cl = pif->pif_classes[pri]) != NULL) { /* modify the class instead of creating a new one */ s = splnet(); @@ -255,7 +264,11 @@ priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) splx(s); #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) - red_destroy(cl->cl_red); + red_destroy(cl->cl_aqm.red); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_destroy(cl->cl_aqm.codel); #endif } else { cl = malloc(sizeof(struct priq_class), M_DEVBUF, @@ -293,7 +306,7 @@ priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) red_pkttime = (int64_t)pif->pif_ifq->altq_ifp->if_mtu * 1000 * 1000 * 1000 / (pif->pif_bandwidth / 8); if (flags & PRCF_RED) { - cl->cl_red = red_alloc(0, 0, + cl->cl_aqm.red = red_alloc(0, 0, qlimit(cl->cl_q) * 10/100, qlimit(cl->cl_q) * 30/100, red_flags, red_pkttime); @@ -301,6 +314,12 @@ priq_class_create(struct priq_if *pif, int pri, int qlimit, int flags, int qid) } } #endif /* ALTQ_RED */ +#ifdef ALTQ_CODEL + if (flags & PRCF_CODEL) { + cl->cl_aqm.codel = codel_alloc(100, 5, 0); + qtype(cl->cl_q) = Q_CODEL; + } +#endif /* ALTQ_CODEL */ return (cl); } @@ -329,10 +348,14 @@ priq_class_destroy(struct priq_class *cl) } splx(s); - if (cl->cl_red != NULL) { + if (cl->cl_aqm.red != NULL) { #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) - red_destroy(cl->cl_red); + red_destroy(cl->cl_aqm.red); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_destroy(cl->cl_aqm.codel); #endif } free(cl->cl_q, M_DEVBUF); @@ -426,7 +449,11 @@ priq_addq(struct priq_class *cl, struct mbuf *m) #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) - return red_addq(cl->cl_red, cl->cl_q, m, cl->cl_pktattr); + return red_addq(cl->cl_aqm.red, cl->cl_q, m, cl->cl_pktattr); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + return codel_addq(cl->cl_aqm.codel, cl->cl_q, m); #endif if (qlen(cl->cl_q) >= qlimit(cl->cl_q)) { m_freem(m); @@ -443,7 +470,11 @@ priq_getq(struct priq_class *cl) { #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) - return red_getq(cl->cl_red, cl->cl_q); + return red_getq(cl->cl_aqm.red, cl->cl_q); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + return codel_getq(cl->cl_aqm.codel, cl->cl_q); #endif return _getq(cl->cl_q); } @@ -483,7 +514,11 @@ get_class_stats(struct priq_classstats *sp, struct priq_class *cl) sp->qtype = qtype(cl->cl_q); #ifdef ALTQ_RED if (q_is_red(cl->cl_q)) - red_getstats(cl->cl_red, &sp->red[0]); + red_getstats(cl->cl_aqm.red, &sp->red[0]); +#endif +#ifdef ALTQ_CODEL + if (q_is_codel(cl->cl_q)) + codel_getstats(cl->cl_aqm.codel, &sp->codel); #endif } diff --git a/sys/altq/altq_priq.h b/sys/altq/altq_priq.h index b4bbf3a..7081425 100644 --- a/sys/altq/altq_priq.h +++ b/sys/altq/altq_priq.h @@ -32,6 +32,7 @@ #include <altq/altq.h> #include <altq/altq_classq.h> #include <altq/altq_red.h> +#include <altq/altq_codel.h> #ifdef __cplusplus extern "C" { @@ -41,8 +42,9 @@ extern "C" { /* priq class flags */ #define PRCF_RED 0x0001 /* use RED */ -#define PRCF_ECN 0x0002 /* use RED/ECN */ +#define PRCF_ECN 0x0002 /* use ECN */ #define PRCF_RIO 0x0004 /* use RIO */ +#define PRCF_CODEL 0x0008 /* use CoDel */ #define PRCF_DEFAULTCLASS 0x1000 /* default class */ /* special class handles */ @@ -60,6 +62,7 @@ struct priq_classstats { /* red and rio related info */ int qtype; struct redstats red[3]; /* rio has 3 red stats */ + struct codel_stats codel; }; #ifdef _KERNEL @@ -67,7 +70,10 @@ struct priq_classstats { struct priq_class { u_int32_t cl_handle; /* class handle */ class_queue_t *cl_q; /* class queue structure */ - struct red *cl_red; /* RED state */ + union { + struct red *red; /* RED state */ + struct codel *codel; /* CoDel state */ + } cl_aqm; int cl_pri; /* priority */ int cl_flags; /* class flags */ struct priq_if *cl_pif; /* back pointer to pif */ diff --git a/sys/altq/altq_var.h b/sys/altq/altq_var.h index bda6527..9436a41 100644 --- a/sys/altq/altq_var.h +++ b/sys/altq/altq_var.h @@ -39,6 +39,9 @@ #ifndef ALTQ_RED #define ALTQ_RED /* RED is enabled by default */ #endif +#ifndef ALTQ_CODEL +#define ALTQ_CODEL /* CODEL is enabled by default */ +#endif #ifndef ALTQ_CBQ #define ALTQ_CBQ /* CBQ is enabled by default */ #endif diff --git a/sys/conf/files b/sys/conf/files index 3878a69..5f2f360 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -567,6 +567,7 @@ file net/if_pppoe.c pppoe needs-flag # kernel sources file altq/altq_subr.c altq file altq/altq_red.c altq +file altq/altq_codel.c altq file altq/altq_cbq.c altq file altq/altq_rmclass.c altq file altq/altq_hfsc.c altq diff --git a/sys/sys/mbuf.h b/sys/sys/mbuf.h index 7e90db7..195dfe1 100644 --- a/sys/sys/mbuf.h +++ b/sys/sys/mbuf.h @@ -104,6 +104,7 @@ struct pkthdr { u_int16_t csum_flags; /* checksum flags */ u_int16_t ether_vtag; /* Ethernet 802.1p+Q vlan tag */ u_int rdomain; /* routing domain id */ + u_int64_t enqueue_time; /* enqueue time, for CoDel */ struct pkthdr_pf pf; };
