This patch changes HTB's class storage from hash+lists to a two-level linear 
array, so it can do constant time (O(1)) class lookup by classid. It improves 
scalability for a large number of classes.

Without the patch, ~14k htb classes can starve a Xeon-3.2 at only 15kpps, 
using most of its cycles traversing lists in htb_find(). The patch 
eliminates this problem, and has a measurable impact even with a few hundred 
classes.

Previously, scalability could be improved by increasing HTB_HSIZE, modifying the 
hash function, and recompiling, but this patch works for everyone without 
recompiling and scales better too.

The patch is for 2.6.20-rc6, I have older ones for 2.6.18 and 2.6.19 if anyone 
is interested.

Signed-off-by: Simon Lodal <[EMAIL PROTECTED]>
--- linux-2.6.20-rc6.base/net/sched/sch_htb.c	2007-01-25 03:19:28.000000000 +0100
+++ linux-2.6.20-rc6/net/sched/sch_htb.c	2007-02-01 05:44:36.000000000 +0100
@@ -68,16 +68,37 @@
     one less than their parent.
 */
 
-#define HTB_HSIZE 16		/* classid hash size */
-#define HTB_EWMAC 2		/* rate average over HTB_EWMAC*HTB_HSIZE sec */
-#define HTB_RATECM 1		/* whether to use rate computer */
+#define HTB_MAX_CLS		(TC_H_MIN(-1)+1)
+#define HTB_CLS_ARRAY_SIZE	PAGE_SIZE
+#define HTB_CLS_PER_ARRAY	(HTB_CLS_ARRAY_SIZE / sizeof(void*))
+#define HTB_CLS_ARRAYS		(HTB_MAX_CLS / HTB_CLS_PER_ARRAY)
+#define HTB_CLS_ARRAY(CID)	(CID / HTB_CLS_PER_ARRAY)
+#define HTB_CLS_INDEX(CID)	(CID & (HTB_CLS_PER_ARRAY-1))
+
+
 #define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
-#define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
+#define HTB_VER 0x30012		/* major must be matched with number suplied by TC as version */
 
 #if HTB_VER >> 16 != TC_HTB_PROTOVER
 #error "Mismatched sch_htb.c and pkt_sch.h"
 #endif
 
+/* Whether to use rate computer. This is only used for statistics output to
+   userspace (tc -s class show dev ...); if you do not care about that and
+   want the last bit of performance, disable this. */
+#define HTB_RATECM
+#ifdef HTB_RATECM
+/* Time between rate computation updates, in seconds, for each class. */
+#define HTB_RATECM_UPDATE_INTERVAL	16
+/* How many HTB_RATECM_UPDATE_INTERVAL intervals to average over when
+   computing the rate. If set to 1, the computed rate will be exactly the
+   observed rate of the last interval. If set to higher values, the computed
+   rate will converge slower, but also vary less with small, temporary changes
+   in traffic.
+*/
+#define HTB_RATECM_AVERAGE_INTERVALS	2
+#endif
+
 /* used internaly to keep status of single class */
 enum htb_cmode {
 	HTB_CANT_SEND,		/* class can't send and can't borrow */
@@ -104,7 +125,7 @@
 	/* topology */
 	int level;		/* our level (see above) */
 	struct htb_class *parent;	/* parent class */
-	struct hlist_node hlist;	/* classid hash list item */
+	struct list_head clist;		/* classid list item */
 	struct list_head sibling;	/* sibling list item */
 	struct list_head children;	/* children list */
 
@@ -165,9 +186,13 @@
 	return rate->data[slot];
 }
 
+typedef struct htb_class* htb_cls_array[HTB_CLS_PER_ARRAY];
+
 struct htb_sched {
-	struct list_head root;	/* root classes list */
-	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
+	struct list_head root;		/* root classes list */
+	struct list_head clist;		/* all classes list */
+	/* all classes arrays for fast lookup */
+	htb_cls_array* classes[HTB_CLS_ARRAYS];
 	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
 
 	/* self list - roots of self generating tree */
@@ -198,8 +223,10 @@
 	psched_time_t now;	/* cached dequeue time */
 	struct timer_list timer;	/* send delay timer */
 #ifdef HTB_RATECM
-	struct timer_list rttim;	/* rate computer timer */
-	int recmp_bucket;	/* which hash bucket to recompute next */
+	struct timer_list rttim;/* rate computer timer */
+	int clscnt;		/* total number of classes */
+	struct list_head *rtcur;/* current class to update rate timer for */
+	int rtiter;		/* current iteration (1..UPDATE_INTERVAL) */
 #endif
 
 	/* non shaped skbs; let them go directly thru */
@@ -209,32 +236,22 @@
 	long direct_pkts;
 };
 
-/* compute hash of size HTB_HSIZE for given handle */
-static inline int htb_hash(u32 h)
-{
-#if HTB_HSIZE != 16
-#error "Declare new hash for your HTB_HSIZE"
-#endif
-	h ^= h >> 8;		/* stolen from cbq_hash */
-	h ^= h >> 4;
-	return h & 0xf;
-}
-
-/* find class in global hash table using given handle */
+/* find class in class arrays using given handle */
 static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
 {
 	struct htb_sched *q = qdisc_priv(sch);
-	struct hlist_node *p;
-	struct htb_class *cl;
+	htb_cls_array* a;
+	int minorid;
 
 	if (TC_H_MAJ(handle) != sch->handle)
 		return NULL;
 
-	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
-		if (cl->classid == handle)
-			return cl;
-	}
-	return NULL;
+	minorid = TC_H_MIN(handle);
+	a = q->classes[HTB_CLS_ARRAY(minorid)];
+	if (a == NULL)
+		return NULL;
+
+	return (*a)[HTB_CLS_INDEX(minorid)];
 }
 
 /**
@@ -689,13 +706,14 @@
 }
 
 #ifdef HTB_RATECM
-#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
+#define RT_GEN(R,S) R+=S-(R/HTB_RATECM_AVERAGE_INTERVALS);S=0
+/* recompute rate for approximately clscnt/UPDATE_INTERVAL classes */
 static void htb_rate_timer(unsigned long arg)
 {
 	struct Qdisc *sch = (struct Qdisc *)arg;
 	struct htb_sched *q = qdisc_priv(sch);
-	struct hlist_node *p;
 	struct htb_class *cl;
+	int cnt,done;
 
 
 	/* lock queue so that we can muck with it */
@@ -704,14 +722,35 @@
 	q->rttim.expires = jiffies + HZ;
 	add_timer(&q->rttim);
 
-	/* scan and recompute one bucket at time */
-	if (++q->recmp_bucket >= HTB_HSIZE)
-		q->recmp_bucket = 0;
-
-	hlist_for_each_entry(cl,p, q->hash + q->recmp_bucket, hlist) {
-		RT_GEN(cl->sum_bytes, cl->rate_bytes);
-		RT_GEN(cl->sum_packets, cl->rate_packets);
+	if (q->clscnt == 0)
+		goto done;
+	if (q->rtiter == HTB_RATECM_UPDATE_INTERVAL)
+		q->rtiter = 0;
+	if (q->rtiter == 0)
+		q->rtcur = q->clist.next;
+	q->rtiter++;
+	cnt = ((q->clscnt * q->rtiter) / HTB_RATECM_UPDATE_INTERVAL) -
+	      ((q->clscnt * (q->rtiter-1)) / HTB_RATECM_UPDATE_INTERVAL);
+	done = 0;
+	for (;;) {
+		/* end of class list */
+		if (q->rtcur == NULL)
+			break;
+		/* last iteration must continue until end */
+		if ((done == cnt) &&
+		    (q->rtiter < HTB_RATECM_UPDATE_INTERVAL)) {
+			break;
+		}
+		cl = list_entry(q->rtcur, struct htb_class, clist);
+		RT_GEN(cl->rate_bytes, cl->sum_bytes);
+		RT_GEN(cl->rate_packets, cl->sum_packets);
+		done++;
+		q->rtcur = q->rtcur->next;
+		if (q->rtcur == &q->clist)
+			q->rtcur = NULL;
 	}
+
+done:
 	spin_unlock_bh(&sch->dev->queue_lock);
 }
 #endif
@@ -1058,23 +1097,19 @@
 {
 	struct htb_sched *q = qdisc_priv(sch);
 	int i;
+	struct list_head *p;
 
-	for (i = 0; i < HTB_HSIZE; i++) {
-		struct hlist_node *p;
-		struct htb_class *cl;
-
-		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
-			if (cl->level)
-				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
-			else {
-				if (cl->un.leaf.q)
-					qdisc_reset(cl->un.leaf.q);
-				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
-			}
-			cl->prio_activity = 0;
-			cl->cmode = HTB_CAN_SEND;
-
+	list_for_each(p, &q->clist) {
+		struct htb_class *cl = list_entry(p, struct htb_class, clist);
+		if (cl->level)
+			memset(&cl->un.inner, 0, sizeof(cl->un.inner));
+		else {
+			if (cl->un.leaf.q) 
+				qdisc_reset(cl->un.leaf.q);
+			INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 		}
+		cl->prio_activity = 0;
+		cl->cmode = HTB_CAN_SEND;
 	}
 	sch->flags &= ~TCQ_F_THROTTLED;
 	del_timer(&q->timer);
@@ -1109,8 +1144,7 @@
 	}
 
 	INIT_LIST_HEAD(&q->root);
-	for (i = 0; i < HTB_HSIZE; i++)
-		INIT_HLIST_HEAD(q->hash + i);
+	INIT_LIST_HEAD(&q->clist);
 	for (i = 0; i < TC_HTB_NUMPRIO; i++)
 		INIT_LIST_HEAD(q->drops + i);
 
@@ -1204,8 +1238,10 @@
 	struct htb_class *cl = (struct htb_class *)arg;
 
 #ifdef HTB_RATECM
-	cl->rate_est.bps = cl->rate_bytes / (HTB_EWMAC * HTB_HSIZE);
-	cl->rate_est.pps = cl->rate_packets / (HTB_EWMAC * HTB_HSIZE);
+	cl->rate_est.bps = cl->rate_bytes /
+		(HTB_RATECM_UPDATE_INTERVAL * HTB_RATECM_AVERAGE_INTERVALS);
+	cl->rate_est.pps = cl->rate_packets /
+		(HTB_RATECM_UPDATE_INTERVAL * HTB_RATECM_AVERAGE_INTERVALS);
 #endif
 
 	if (!cl->level && cl->un.leaf.q)
@@ -1310,6 +1346,7 @@
 static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
 {
 	struct htb_sched *q = qdisc_priv(sch);
+	int minorid = TC_H_MIN(cl->classid);
 
 	if (!cl->level) {
 		BUG_TRAP(cl->un.leaf.q);
@@ -1325,7 +1362,7 @@
 						  struct htb_class, sibling));
 
 	/* note: this delete may happen twice (see htb_delete) */
-	hlist_del_init(&cl->hlist);
+	list_del(&cl->clist);
 	list_del(&cl->sibling);
 
 	if (cl->prio_activity)
@@ -1334,6 +1371,7 @@
 	if (cl->cmode != HTB_CAN_SEND)
 		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
 
+	(*q->classes[HTB_CLS_ARRAY(minorid)])[HTB_CLS_INDEX(minorid)] = NULL;
 	kfree(cl);
 }
 
@@ -1341,6 +1379,7 @@
 static void htb_destroy(struct Qdisc *sch)
 {
 	struct htb_sched *q = qdisc_priv(sch);
+	int i;
 
 	del_timer_sync(&q->timer);
 #ifdef HTB_RATECM
@@ -1355,6 +1394,8 @@
 	while (!list_empty(&q->root))
 		htb_destroy_class(sch, list_entry(q->root.next,
 						  struct htb_class, sibling));
+	for (i=0; i<HTB_CLS_ARRAYS; i++)
+		kfree(q->classes[i]);
 
 	__skb_queue_purge(&q->direct_queue);
 }
@@ -1381,8 +1422,15 @@
 
 	sch_tree_lock(sch);
 
-	/* delete from hash and active; remainder in destroy_class */
-	hlist_del_init(&cl->hlist);
+	/* delete from class list and active; remainder in destroy_class */
+	if (q->rtcur == &cl->clist) {
+		q->rtcur = q->rtcur->next;
+		if (q->rtcur == &q->clist)
+			q->rtcur = NULL;
+	}
+	if (--q->clscnt == 0)
+		q->rtiter = 0;
+	list_del_init(&cl->clist);
 
 	if (!cl->level) {
 		qlen = cl->un.leaf.q->q.qlen;
@@ -1422,6 +1470,7 @@
 	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
 	struct rtattr *tb[TCA_HTB_RTAB];
 	struct tc_htb_opt *hopt;
+	int minorid = TC_H_MIN(classid);
 
 	/* extract all subattrs from opt attr */
 	if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
@@ -1453,12 +1502,18 @@
 			goto failure;
 		}
 		err = -ENOBUFS;
+		if (q->classes[HTB_CLS_ARRAY(minorid)] == NULL) {
+			if ((q->classes[HTB_CLS_ARRAY(minorid)] = 
+			     kzalloc(sizeof(htb_cls_array), GFP_KERNEL))
+			    == NULL)
+				goto failure;
+		}
 		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
 			goto failure;
 
 		cl->refcnt = 1;
+		INIT_LIST_HEAD(&cl->clist);
 		INIT_LIST_HEAD(&cl->sibling);
-		INIT_HLIST_NODE(&cl->hlist);
 		INIT_LIST_HEAD(&cl->children);
 		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
 		RB_CLEAR_NODE(&cl->pq_node);
@@ -1503,10 +1558,12 @@
 		PSCHED_GET_TIME(cl->t_c);
 		cl->cmode = HTB_CAN_SEND;
 
-		/* attach to the hash list and parent's family */
-		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
+		/* attach to the classes list+array and parent's family */
+		list_add_tail(&cl->clist, &q->clist);
 		list_add_tail(&cl->sibling,
 			      parent ? &parent->children : &q->root);
+		(*q->classes[HTB_CLS_ARRAY(minorid)])[HTB_CLS_INDEX(minorid)] = cl;
+		q->clscnt++;
 	} else
 		sch_tree_lock(sch);
 
@@ -1602,26 +1659,22 @@
 static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
 {
 	struct htb_sched *q = qdisc_priv(sch);
-	int i;
+	struct list_head *p;
 
 	if (arg->stop)
 		return;
 
-	for (i = 0; i < HTB_HSIZE; i++) {
-		struct hlist_node *p;
-		struct htb_class *cl;
-
-		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
-			if (arg->count < arg->skip) {
-				arg->count++;
-				continue;
-			}
-			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
-				arg->stop = 1;
-				return;
-			}
+	list_for_each(p, &q->clist) {
+		struct htb_class *cl = list_entry(p, struct htb_class, clist);
+		if (arg->count < arg->skip) {
 			arg->count++;
+			continue;
+		}
+		if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
+			arg->stop = 1;
+			return;
 		}
+		arg->count++;
 	}
 }
 
_______________________________________________
LARTC mailing list
LARTC@mailman.ds9a.nl
http://mailman.ds9a.nl/cgi-bin/mailman/listinfo/lartc

Reply via email to