Patrick McHardy wrote:
I've completed the port and tested it yesterday; unfortunately it's not usable in the real world as is. There is a strong bias against non-ECN flows because their packets are simply dropped instead of marked. At high load (50 ECN vs. 1 non-ECN flow) and a marking probability of about 10% the non-ECN flow simply stalls. I can send you the patch if you're interested ..
Thank you, I am really interested.
Patch is attached. Use the original tc patch.
I will try it and see how it behaves for me in various circumstances.
What you say, though, is probably true - and the situation is even more pronounced when you consider that different TCP stacks react differently to ECN and to packet drops - a single drop percentage will not be enough.
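(As a rough sanity check with the well-known Mathis et al. estimate, rate ~= (MSS/RTT) * sqrt(3/(2p)): at p = 0.1 a non-ECN flow is held to an average window of about sqrt(15) ~= 4 packets, which leaves it almost no headroom before retransmission timeouts.)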
OTOH, with only ECN flows it works great: not a single packet drop after a couple of minutes, whereas without BLUE there were multiple thousands.
Which of course brings us to SFB - with Stochastic Fair Blue, the drop percentage for the non-ECN flow should be significantly lower and the connections should share the link more or less fairly.
I'm going to check this out, don't know anything about it.
Regards Patrick
-- Naked
# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
# 2004/03/06 16:57:42+01:00 [EMAIL PROTECTED]
# Add BLUE scheduler
#
# net/sched/sch_blue.c
# 2004/03/06 16:57:35+01:00 [EMAIL PROTECTED] +453 -0
#
# net/sched/sch_blue.c
# 2004/03/06 16:57:35+01:00 [EMAIL PROTECTED] +0 -0
# BitKeeper file /home/kaber/src/blue/linux-2.6/net/sched/sch_blue.c
#
# net/sched/Makefile
# 2004/03/06 16:57:35+01:00 [EMAIL PROTECTED] +1 -0
# Add BLUE scheduler
#
# net/sched/Kconfig
# 2004/03/06 16:57:35+01:00 [EMAIL PROTECTED] +19 -0
# Add BLUE scheduler
#
# include/linux/pkt_sched.h
# 2004/03/06 16:57:35+01:00 [EMAIL PROTECTED] +31 -0
# Add BLUE scheduler
#
diff -Nru a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
--- a/include/linux/pkt_sched.h Sat Mar 6 16:58:06 2004
+++ b/include/linux/pkt_sched.h Sat Mar 6 16:58:06 2004
@@ -321,6 +321,37 @@
TCA_HFSC_MAX = TCA_HFSC_USC
};
+/* BLUE section */
+
+enum
+{
+ TCA_BLUE_UNSPEC,
+ TCA_BLUE_PARMS,
+};
+
+struct tc_blue_qopt
+{
+ __u32 limit; /* Limit on queue length, bytes */
+ __s32 freeze_time; /* Minimum time between Pmark updates, usec! */
+ __s32 pmark_init; /* Initial Pmark */
+ __s32 pmark_inc; /* Pmark increment step */
+ __s32 pmark_dec; /* Pmark decrement step */
+ __s32 pmark_max; /* Pmark never exceeds this maximum */
+ __u32 flags;
+};
+
+/* Flags: */
+#define TC_BLUE_ECN 1 /* Do ECN marking, if ECT is set in the packet */
+
+struct tc_blue_xstats
+{
+ __s32 pmark; /* Current Pmark */
+ __u32 marked; /* Marked packets */
+ __u32 early_drops; /* Early drops */
+ __u32 limit_drops; /* Drops due to queue limits */
+ __u32 other_drops; /* Drops due to drop() calls */
+};
+
/* CBQ section */
#define TC_CBQ_MAXPRIO 8
diff -Nru a/net/sched/Kconfig b/net/sched/Kconfig
--- a/net/sched/Kconfig Sat Mar 6 16:58:06 2004
+++ b/net/sched/Kconfig Sat Mar 6 16:58:06 2004
@@ -101,6 +101,25 @@
To compile this code as a module, choose M here: the
module will be called sch_red.
+config NET_SCH_BLUE
+ tristate "BLUE queue"
+ depends on NET_SCHED
+ help
+ Say Y here if you want to use the BLUE queue management algorithm
+ for some of your network devices. BLUE is similar to RED in that it
+ randomly ECN-marks or drops packets, but it uses a different approach
+ to select the packets to be marked/dropped, one that is intended to
+ perform better under high load.
+
+ For details and references about BLUE see:
+
+ - http://thefengs.com/wuchang/blue/
+ - http://www.sch.bme.hu/~bartoki/projects/thesis/
+ - The top of net/sched/sch_blue.c
+
+ To compile this code as a module, choose M here: the
+ module will be called sch_blue.
+
config NET_SCH_SFQ
tristate "SFQ queue"
depends on NET_SCHED
diff -Nru a/net/sched/Makefile b/net/sched/Makefile
--- a/net/sched/Makefile Sat Mar 6 16:58:06 2004
+++ b/net/sched/Makefile Sat Mar 6 16:58:06 2004
@@ -15,6 +15,7 @@
obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o
obj-$(CONFIG_NET_SCH_RED) += sch_red.o
obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o
+obj-$(CONFIG_NET_SCH_BLUE) += sch_blue.o
obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o
obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o
obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o
diff -Nru a/net/sched/sch_blue.c b/net/sched/sch_blue.c
--- /dev/null Wed Dec 31 16:00:00 1969
+++ b/net/sched/sch_blue.c Sat Mar 6 16:58:06 2004
@@ -0,0 +1,453 @@
+/*
+ * net/sched/sch_blue.c BLUE queue.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Author: Bartok Istvan, <[EMAIL PROTECTED]> <[EMAIL PROTECTED]>
+ * Credits: Based on sch_red.c written by Alexey Kuznetsov and others
+ *
+ * Changelog:
+ *
+ * Bartoki, 2001-Apr-10 - Initial version.
+ * Bartoki, 2001-Apr-11 - Added displaying Pmark in the statistics.
+ * - Added some module-testing code (not to be included
+ * in the final distribution).
+ * Bartoki, 2001-Apr-24 - Added compile-time setting of algorithm variations.
+ * Bartoki, 2001-May-24 - Added initial Pmark to the parameters
+ * kaber, 2004-Mar-05 - Ported to 2.6, IPv6 ECN marking bug fixed,
+ * style changes
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/pkt_sched.h>
+#include <net/pkt_sched.h>
+#include <net/inet_ecn.h>
+#include <net/ip.h>
+
+/* The BLUE algorithm:
+ ===================
+
+ Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
+ "Blue: A New Class of Active Queue Management Algorithms"
+ U. Michigan CSE-TR-387-99, April 1999.
+
+ http://thefengs.com/wuchang/work/CSE-TR-387-99.pdf
+ http://thefengs.com/wuchang/blue/
+
+ This file implements the basic version of the algorithm as given
+ on page 5, Fig. 3 of the paper (without the Qave calculation).
+
+ From the paper:
+
+ "BLUE maintains a single probability Pm, which it uses to mark
+ (or drop) packets when they are enqueued. If the queue is continually
+ dropping packets due to queue overflow, BLUE increments Pm, thus
+ increasing the rate at which it sends back congestion notification.
+ Conversely, if the queue becomes empty or if the link is idle, BLUE
+ decreases its marking probability. This effectively allows BLUE to
+ 'learn' the correct rate it needs to send back congestion
+ notification."
+
+ Upon packet loss: if ( (now-last_update) > freeze_time ) {
+ pm += increment;
+ last_update = now;
+ }
+
+ Upon link idle event: if ( (now-last_update) > freeze_time ) {
+ pm -= decrement;
+ last_update = now;
+ }
+
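+ To get a feel for the time constants (illustrative values, not
+ defaults): with freeze_time = 10000 usec, Pmark changes at most 100
+ times per second, so an increment of 0.0025 needs 1/0.0025 = 400
+ steps, i.e. at least 4 seconds, to ramp Pmark from 0.0 up to 1.0.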
+
+ Parameters, settable by user:
+ -----------------------------
+
+ limit - BYTES - Limit on queue length: BLUE will not enqueue
+ new packets while its backlog is greater than limit.
+ Note that the backlog can still exceed limit when a
+ packet is enqueued while the backlog is already above
+ (limit - MTU), or when a packet gets requeued.
+
+ pmark_init - see include/linux/pkt_sched.h
+ freeze_time
+ pmark_inc
+ pmark_dec
+ pmark_max
+ flags - For now, only one flag:
+
+ 'ecn' - if set, tries to ECN-mark packets instead of
+ early dropping (but drops if the packet is not
+ ECN-capable). As sch_red does, this module can ECN-mark
+ even if the kernel was compiled without ECN-support.
+
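+ A hypothetical example invocation (the exact option names depend on
+ the accompanying out-of-tree tc patch, so take this purely as an
+ illustration of the parameters above):
+
+ tc qdisc add dev eth0 root blue limit 128000 freeze_time 10000 \
+ pmark_init 0.0 pmark_inc 0.0025 pmark_dec 0.00025 pmark_max 1.0 ecn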
+
+ Implementation:
+ ---------------
+
+ Fractional fixed point format is used to represent Pmark:
+
+                        sign    1/2   1/8
+                           |     |     |
+   Float 0.625 will be:   0  .  1  0  1  0 0 0 0...[all 0s] = 0x50000000
+                              |     |
+                   binary point    1/4
+
+ As only the [0.0, 1.0] range is used (1.0 is represented as
+ 0x7FFFFFFF = ~0.9999999995 for i386) this is an easy way to detect
+ over/underflows.
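+
+ A worked example (illustrative values): a marking probability of 10%
+ is stored as pmark = 0.1 * 2^31 ~= 0x0CCCCCCD, and an increment step
+ of 0.0025 as 0.0025 * 2^31 ~= 0x0051EB85; blue_inc() and blue_dec()
+ below then operate on these values with plain integer arithmetic.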
+
+ The queue is declared empty/idle when the dequeue function is called
+ with an empty queue. Note that the network device is not necessarily
+ idle at that moment, but we cannot get much further in estimating
+ when the transmission really ends, since the hardware or the device
+ driver may hold an internal buffer of a few packets (to enable
+ back-to-back transmits) anyway.
+
+ For further details on design and performance analysis see:
+ http://www.sch.bme.hu/~bartoki/projects/thesis/
+
+*/
+
+
+/* 1 - Single last_update
+ * 0 - Separate timestamps for increase and decrease
+ */
+#define BLUE_SINGLE_UPDATE_TIME 0
+
+/* 1 - Try to increase Pmark not just on a tail-drop event, but also when
+ * an enqueue occurs on a more than half-filled queue
+ * 0 - Try to increase Pmark only on tail-drop
+ */
+#define BLUE_INCREASE_ON_HALFQ 1
+
+
+struct blue_sched_data
+{
+ struct tc_blue_qopt qopt; /* Parameters */
+ struct tc_blue_xstats xstats; /* BLUE-specific statistics */
+
+ int pmark; /* Packet marking probability */
+
+#if BLUE_SINGLE_UPDATE_TIME
+ psched_time_t last_update; /* Timestamp of the last pmark update */
+#else
+ psched_time_t last_inc; /* Timestamp of the last pmark increase */
+ psched_time_t last_dec; /* Timestamp of the last pmark decrease */
+#endif
+
+};
+
+
+/*
+ * Increments pmark
+ * Assumes 0 <= pmark and 0 <= pmark_inc
+ */
+static inline void blue_inc(struct blue_sched_data *q)
+{
+ q->pmark += q->qopt.pmark_inc;
+
+ /* On overflow or exceeding max use pmark_max: */
+ if (q->pmark < 0 || q->pmark > q->qopt.pmark_max)
+ q->pmark = q->qopt.pmark_max;
+}
+
+/*
+ * Increments pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_inc(struct blue_sched_data *q)
+{
+ psched_time_t now;
+ psched_tdiff_t diff;
+
+ PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+ diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+ diff = PSCHED_TDIFF_SAFE(now, q->last_inc, q->qopt.freeze_time, 0);
+#endif
+
+ /*
+ * The time difference is a plain long; it wraps often on 32-bit
+ * architectures, so even if (last_update < now) is true, the diff
+ * can be negative.
+ *
+ * NOTE: this code still won't increase when:
+ * now = last_update + k*MAX_LONG + d
+ * where 0 <= d < freeze_time
+ */
+ if (diff >= q->qopt.freeze_time || diff < 0) {
+ blue_inc(q);
+#if BLUE_SINGLE_UPDATE_TIME
+ q->last_update = now;
+#else
+ q->last_inc = now;
+#endif
+ }
+}
+
+/*
+ * Decrements pmark
+ * Assumes 0 <= pmark and 0 <= pmark_dec
+ */
+static inline void blue_dec(struct blue_sched_data *q)
+{
+ q->pmark -= q->qopt.pmark_dec;
+
+ /* On underflow use 0 */
+ if (q->pmark < 0)
+ q->pmark = 0;
+}
+
+/*
+ * Decrements pmark, but only if ((now - last_update) >= freeze_time)
+ */
+static inline void blue_try_dec(struct blue_sched_data *q)
+{
+ psched_time_t now;
+ psched_tdiff_t diff;
+
+ PSCHED_GET_TIME(now);
+
+#if BLUE_SINGLE_UPDATE_TIME
+ diff = PSCHED_TDIFF_SAFE(now, q->last_update, q->qopt.freeze_time, 0);
+#else
+ diff = PSCHED_TDIFF_SAFE(now, q->last_dec, q->qopt.freeze_time, 0);
+#endif
+
+ /*
+ * For diff < 0 see the comment at the same place in blue_try_inc()
+ */
+ if (diff >= q->qopt.freeze_time || diff < 0) {
+ blue_dec(q);
+#if BLUE_SINGLE_UPDATE_TIME
+ q->last_update = now;
+#else
+ q->last_dec = now;
+#endif
+ }
+}
+
+/*
+ * Tries to ECN-mark the packet.
+ * Returns: 1 - success: marked now, or was marked already
+ * 0 - not marked: ECT bit is not set in the packet,
+ * or not an IP packet
+ */
+static int blue_ecn_mark(struct sk_buff *skb)
+{
+ if (skb->nh.raw + 20 > skb->tail)
+ return 0;
+
+ switch (skb->protocol) {
+ case __constant_htons(ETH_P_IP):
+ if (!INET_ECN_is_capable(skb->nh.iph->tos))
+ return 0;
+ if (INET_ECN_is_not_ce(skb->nh.iph->tos))
+ IP_ECN_set_ce(skb->nh.iph);
+ return 1;
+ case __constant_htons(ETH_P_IPV6):
+ if (!INET_ECN_is_capable(ip6_get_dsfield(skb->nh.ipv6h)))
+ return 0;
+ IP6_ECN_set_ce(skb->nh.ipv6h);
+ return 1;
+ default:
+ return 0;
+ }
+}
+
+static int
+blue_enqueue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+ /*
+ * Try to mark or early drop with probability pmark: net_random()
+ * returns 32 random bits, and shifting right by one yields a
+ * uniform 31-bit sample to compare against pmark. Operator < in
+ * the comparison makes pmark == 0 a true 0.0 probability.
+ */
+ if (((u32)net_random() >> 1) < q->pmark)
+ goto mark;
+
+enqueue:
+ if (sch->stats.backlog <= q->qopt.limit) {
+
+#if BLUE_INCREASE_ON_HALFQ
+ if (sch->stats.backlog >= q->qopt.limit/2)
+ blue_try_inc(q);
+#endif
+
+ __skb_queue_tail(&sch->q, skb);
+ sch->stats.backlog += skb->len;
+ sch->stats.bytes += skb->len;
+ sch->stats.packets++;
+
+ return NET_XMIT_SUCCESS;
+ } else {
+ /* No space left, try to increment pmark and drop the packet */
+ blue_try_inc(q);
+ q->xstats.limit_drops++;
+ goto drop;
+ }
+
+mark:
+ sch->stats.overlimits++;
+ if ((q->qopt.flags & TC_BLUE_ECN) && blue_ecn_mark(skb)) {
+ q->xstats.marked++;
+ goto enqueue;
+ } else {
+ q->xstats.early_drops++;
+ goto drop;
+ }
+
+drop:
+ kfree_skb(skb);
+ sch->stats.drops++;
+ return NET_XMIT_CN;
+}
+
+static int
+blue_requeue(struct sk_buff *skb, struct Qdisc *sch)
+{
+ __skb_queue_head(&sch->q, skb);
+ sch->stats.backlog += skb->len;
+ return NET_XMIT_SUCCESS;
+}
+
+static struct sk_buff *
+blue_dequeue(struct Qdisc *sch)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+ struct sk_buff *skb;
+
+ skb = __skb_dequeue(&sch->q);
+ if (skb) {
+ sch->stats.backlog -= skb->len;
+ return skb;
+ } else {
+ /* Queue empty, try to decrement pmark */
+ blue_try_dec(q);
+ return NULL;
+ }
+}
+
+static unsigned int
+blue_drop(struct Qdisc *sch)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+ struct sk_buff *skb;
+ unsigned int len;
+
+ skb = __skb_dequeue_tail(&sch->q);
+ if (skb) {
+ sch->stats.backlog -= skb->len;
+ sch->stats.drops++;
+ q->xstats.other_drops++;
+ len = skb->len;
+ kfree_skb(skb);
+ return len;
+ } else {
+ return 0;
+ }
+}
+
+static void
+blue_reset(struct Qdisc *sch)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+
+ __skb_queue_purge(&sch->q);
+ sch->stats.backlog = 0;
+ q->pmark = q->qopt.pmark_init;
+}
+
+static int
+blue_init(struct Qdisc *sch, struct rtattr *opt)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+ struct rtattr *tb[TCA_BLUE_PARMS];
+ struct tc_blue_qopt *ctl;
+
+ if (opt == NULL
+ || rtattr_parse(tb, TCA_BLUE_PARMS, RTA_DATA(opt), RTA_PAYLOAD(opt))
+ || tb[TCA_BLUE_PARMS-1] == 0
+ || RTA_PAYLOAD(tb[TCA_BLUE_PARMS-1]) < sizeof(*ctl))
+ return -EINVAL;
+ ctl = RTA_DATA(tb[TCA_BLUE_PARMS-1]);
+
+ sch_tree_lock(sch);
+ q->qopt = *ctl;
+ q->pmark = q->qopt.pmark_init;
+ sch_tree_unlock(sch);
+ return 0;
+}
+
+static int blue_dump_xstats(struct blue_sched_data *q, struct sk_buff *skb)
+{
+ q->xstats.pmark = q->pmark;
+ RTA_PUT(skb, TCA_XSTATS, sizeof(q->xstats), &q->xstats);
+ return 0;
+
+rtattr_failure:
+ return 1;
+}
+
+static int
+blue_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct blue_sched_data *q = (struct blue_sched_data *)sch->data;
+ unsigned char *b = skb->tail;
+ struct rtattr *rta = (struct rtattr *)b;
+
+ RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+ RTA_PUT(skb, TCA_BLUE_PARMS, sizeof(q->qopt), &q->qopt);
+ rta->rta_len = skb->tail - b;
+
+ if (blue_dump_xstats(q, skb))
+ goto rtattr_failure;
+ return skb->len;
+
+rtattr_failure:
+ skb_trim(skb, b - skb->data);
+ return -1;
+}
+
+struct Qdisc_ops blue_qdisc_ops =
+{
+ .id = "blue",
+ .init = blue_init,
+ .change = blue_init,
+ .reset = blue_reset,
+ .dump = blue_dump,
+ .enqueue = blue_enqueue,
+ .dequeue = blue_dequeue,
+ .requeue = blue_requeue,
+ .drop = blue_drop,
+ .priv_size = sizeof(struct blue_sched_data),
+ .owner = THIS_MODULE,
+};
+
+static int __init init_blue(void)
+{
+ return register_qdisc(&blue_qdisc_ops);
+}
+
+static void __exit exit_blue(void)
+{
+ unregister_qdisc(&blue_qdisc_ops);
+}
+
+MODULE_LICENSE("GPL");
+module_init(init_blue);
+module_exit(exit_blue);
