Hi all,

I have a preliminary patch for the Alan Cox kernel for vserver, including 
O(1) scheduling.

This is implemented with a per-s_context `Token Bucket' that counts jiffies 
consumed by processes belonging to an s_context.  The token bucket has the 
following tuning options:

   * Fill rate (N tokens per Y jiffies)
   * Bucket size

In the timer tick function, a token is removed from the bucket of whichever 
security context is currently running.  At process rescheduling time, the 
number of owed tokens is calculated and added to the bucket.

The token bucket is assumed to be balanced when it is half full; in this 
condition, the process being scheduled gets no priority bonus.  If it is 
completely full, the process gets a -5 priority bonus.  If it is empty, 
+15 (it's parabolic; I tried geometric but it didn't seem to be enough for 
some simple test cases).

The scheduling is not `hard'; rather it relies on the feedback loop created 
between the token bucket and the process' priority.  In the case of only a 
few running processes in particular, it does not appear to do much.  But 
you can see the priority go up and down with `vtop'.

In theory, you could set a fraction of the CPU allocated per s_context, to 
individually control processor time allocated to each virtual server.  
However, I have yet to design a syscall interface for that and have 
currently hardcoded it as each s_context receiving 1/4 of the CPU.  This 
is set in kernel/sys.c.

The attached patch is against 2.4.20-ac2 with the 2.4.20ctx-16 patch 
applied.

I'd be interested in any comments or suggestions of how best to leverage 
this feature from userspace!
-- 
Sam Vilain, [EMAIL PROTECTED]

Real Programmers don't write in LISP.  Only wimpy programs contain
more parentheses than actual code.
diff -urN linux-2.4.20-ac2/Makefile linux-2.4.20-ac2-ctx16/Makefile
--- linux-2.4.20-ac2/Makefile	Fri Feb 21 00:51:46 2003
+++ linux-2.4.20-ac2-ctx16/Makefile	Wed Feb 19 22:29:33 2003
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 4
 SUBLEVEL = 20
-EXTRAVERSION = -ac2
+EXTRAVERSION = -ac2-ctx16
 
 KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
 
Binary files linux-2.4.20-ac2/arch/i386/boot/bzImage.ok and linux-2.4.20-ac2-ctx16/arch/i386/boot/bzImage.ok differ
diff -urN linux-2.4.20-ac2/fs/proc/array.c linux-2.4.20-ac2-ctx16/fs/proc/array.c
--- linux-2.4.20-ac2/fs/proc/array.c	Fri Feb 21 00:51:47 2003
+++ linux-2.4.20-ac2-ctx16/fs/proc/array.c	Thu Feb 20 22:09:00 2003
@@ -309,16 +309,25 @@
 		}
 		*buffer++ = ']';
 		*buffer++ = '\n';
-		buffer += sprintf (buffer,"ctxticks: %d %ld %d\n"
-			,atomic_read(&task->s_info->ticks),task->counter
-			,task->s_info->refcount);
+		buffer += sprintf (buffer,"ctxsched: %d/%d = (FR: %d per %d; ago %lu)\n"
+				   "prio: %d (static prio = %d)\n"
+				   "ctxnproc: %d\n"
+				   ,task->s_info->tokens
+				   ,task->s_info->tokens_max
+				   ,task->s_info->tokens_fr
+				   ,task->s_info->tokens_div
+				   ,(jiffies - task->s_info->tokens_jfy)
+				   ,task->prio
+				   ,task->static_prio
+				   ,task->s_info->refcount);
+		buffer += sprintf (buffer, "tokens_left: %d\n", tokens_left(task));
 		buffer += sprintf (buffer,"ctxflags: %d\n"
 			,task->s_info->flags);
 		buffer += sprintf (buffer,"initpid: %d\n"
 			,task->s_info->initpid);
 	}else{
 		buffer += sprintf (buffer,"s_context: %d\n",task->s_context);
-		buffer += sprintf (buffer,"ctxticks: none\n");
+		buffer += sprintf (buffer,"ctxsched: none\n");
 		buffer += sprintf (buffer,"ctxflags: none\n");
 		buffer += sprintf (buffer,"initpid: none\n");
 	}
diff -urN linux-2.4.20-ac2/include/linux/sched.h linux-2.4.20-ac2-ctx16/include/linux/sched.h
--- linux-2.4.20-ac2/include/linux/sched.h	Fri Feb 21 00:51:48 2003
+++ linux-2.4.20-ac2-ctx16/include/linux/sched.h	Fri Feb 21 00:58:42 2003
@@ -174,12 +174,25 @@
  * user-space.  This allows kernel threads to set their
  * priority to a value higher than any user task. Note:
  * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ *
+ * Illustrative Example Priority Ranges:
+ *   0 ..  70 - Real Time kernel threads/head-room
+ *  71 ..  80 - nice -20 .. -11 tasks with bonuses
+ *  81 ..  90 - nice -10 .. -1  tasks with bonuses
+ *  91 ..  99 - nice 0 tasks with bonuses
+ *     100    - nice 0 with no bonuses or penalties
+ * 101 .. 109 - nice 0 tasks with penalties
+ * 110 .. 119 - nice 0 tasks with harsh penalties
+ * 120 .. 129 - nice +1 .. +10 tasks with harsh penalties
+ * 130 .. 139 - nice +11 .. +19 tasks with harsh penalties
+ *     149    - idle thread
  */
 
-#define MAX_USER_RT_PRIO	100
+#define MAX_USER_RT_PRIO	80
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
+#define MAX_PRIO_USER           120
+#define MAX_PRIO		(MAX_PRIO_USER + 29)
 
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
 
 /*
  * The maximum RT priority is configurable.  If the resulting
@@ -339,10 +352,15 @@
 	char nodename[65];
 	char domainname[65];
 	int flags;		/* S_CTX_INFO_xxx */
-	atomic_t ticks;		/* Number of ticks used by all process */
-				/* in the s_context */
 	int initpid;		/* PID of the logical process 1 of the */
 				/* of the context */
+        int tokens;             /* number of CPU tokens in this context */
+	int tokens_fr;          /* Fill rate: add X tokens... */
+	int tokens_div;         /* Divisor:   per Y jiffies   */
+        int tokens_max;         /* Limit:     no more than N tokens */
+	unsigned long tokens_jfy; /* add an integral multiple of Y to this */
+	spinlock_t tokens_lock; /* lock for this structure */
+
 	void *data1;
 	void *data2;
 	void *data3;
@@ -560,8 +578,8 @@
     addr_limit:		KERNEL_DS,					\
     exec_domain:	&default_exec_domain,				\
     lock_depth:		-1,						\
-    prio:		MAX_PRIO-20,					\
-    static_prio:	MAX_PRIO-20,					\
+    prio:		MAX_USER_RT_PRIO+20,				\
+    static_prio:	MAX_USER_RT_PRIO+20,				\
     policy:		SCHED_OTHER,					\
     cpus_allowed:	-1,						\
     mm:			NULL,						\
@@ -1019,6 +1037,56 @@
 void sys_release_ip_info (struct iproot_info *);
 void sys_assign_ip_info (struct iproot_info *);
 void sys_alloc_ip_info (void);
+
+/* scheduling stuff */
+int tokens_left(struct task_struct *);
+int tokens_recalc(struct task_struct *);
+
+/* update the token allocation for a process */
+static inline void tokens_alloc(struct task_struct *tsk)
+{
+	if (tsk->s_info && (tsk->s_info->flags & S_CTX_INFO_SCHED))
+	{
+		spin_lock(&tsk->s_info->tokens_lock);
+
+		tokens_recalc(tsk);
+
+		/* else if (eat && tsk->s_info->tokens > 0)
+		   {
+		   tsk->s_info->tokens--;
+		   } */
+		spin_unlock(&tsk->s_info->tokens_lock);
+	}
+}
+
+/* eat a token, only allocating more if we run out.
+
+   Return:
+      1 if a token was eaten,
+      0 if the process has no tokens left
+     -1 if tokens do not apply
+*/
+static inline int eat_token(struct task_struct *tsk)
+{
+	int eaten = -1;
+	if (tsk->s_info && (tsk->s_info->flags & S_CTX_INFO_SCHED) )
+	{
+		spin_lock(&tsk->s_info->tokens_lock);
+		if (tsk->s_info->tokens || tokens_recalc(tsk))
+		{
+			tsk->s_info->tokens--;
+			eaten = 1;
+		}
+		else
+		{
+			eaten = 0;
+		}
+		spin_unlock(&tsk->s_info->tokens_lock);
+	}
+	return eaten;
+}
+
+
 
 static inline void clear_need_resched(void)
 {
diff -urN linux-2.4.20-ac2/kernel/sched.c linux-2.4.20-ac2-ctx16/kernel/sched.c
--- linux-2.4.20-ac2/kernel/sched.c	Fri Feb 21 00:51:48 2003
+++ linux-2.4.20-ac2-ctx16/kernel/sched.c	Fri Feb 21 01:07:13 2003
@@ -43,7 +43,7 @@
  */
 #define USER_PRIO(p)		((p)-MAX_RT_PRIO)
 #define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO_USER))
 
 /*
  * These are the 'tuning knobs' of the scheduler:
@@ -51,13 +51,17 @@
  * Minimum timeslice is 10 msecs, default timeslice is 150 msecs,
  * maximum timeslice is 300 msecs. Timeslices get refilled after
  * they expire.
+ *
+ * Note: when processes start getting to really low priorities they
+ *       will quickly hit the MIN_TIMESLICE.
  */
 #define MIN_TIMESLICE		( 10 * HZ / 1000)
 #define MAX_TIMESLICE		(300 * HZ / 1000)
 #define CHILD_PENALTY		95
 #define PARENT_PENALTY		100
 #define EXIT_WEIGHT		3
-#define PRIO_BONUS_RATIO	25
+#define PRIO_BONUS_RATIO	20
+#define VAVAVOOM_RATIO  	50
 #define INTERACTIVE_DELTA	2
 #define MAX_SLEEP_AVG		(2*HZ)
 #define STARVATION_LIMIT	(2*HZ)
@@ -90,15 +94,10 @@
  * too hard.
  */
 
-#define SCALE(v1,v1_max,v2_max) \
-	(v1) * (v2_max) / (v1_max)
-
-#define DELTA(p) \
-	(SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \
-		INTERACTIVE_DELTA)
-
+/* Note: the algorithm here has changed, but the above table hasn't */
 #define TASK_INTERACTIVE(p) \
-	((p)->prio <= (p)->static_prio - DELTA(p))
+	(p->sleep_avg - MAX_SLEEP_AVG/10 \
+	 > TASK_USER_PRIO(p)*MAX_SLEEP_AVG / MAX_USER_PRIO)
 
 /*
  * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ]
@@ -109,14 +108,20 @@
  * priority thread gets MIN_TIMESLICE worth of execution time.
  *
  * task_timeslice() is the interface that is used by the scheduler.
+ *
+ * IMHO this should be on a logarithmic rather than geometric scale.
  */
 #define BASE_TIMESLICE(p) (MIN_TIMESLICE + \
 	((MAX_TIMESLICE - MIN_TIMESLICE) * \
-	 (MAX_PRIO-1-(p)->static_prio)/(MAX_USER_PRIO - 1)))
+	 (MAX_PRIO_USER-1-(p)->prio)/(MAX_PRIO_USER - 1)))
 
 static inline unsigned int task_timeslice(task_t *p)
 {
-	return BASE_TIMESLICE(p);
+	if (p->prio >= MAX_PRIO_USER) {
+		return MIN_TIMESLICE;
+	} else {
+		return BASE_TIMESLICE(p);
+	}
 }
 
 /*
@@ -238,23 +243,52 @@
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
- * into the -5 ... 0 ... +5 bonus/penalty range.
+ * into a -4 ... 0 ... +4 bonus/penalty range.
  *
- * We use 25% of the full 0...39 priority range so that:
+ * Additionally, we scale another amount based on the number of CPU
+ * tokens currently held by the s_context, if the process is part of
+ * an s_context (and the appropriate SCHED flag is set).  This ranges
+ * from -5 ... 0 ... +15.
+ *
+ * So, the total bonus is -9 .. 0 .. +19
+ *
+ * We use ~ 50% of the full 0...39 priority range so that:
  *
  * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
- * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
+ * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks, unless
+ *    that security context is far exceeding its CPU allocation.
  *
  * Both properties are important to certain workloads.
  */
 static inline int effective_prio(task_t *p)
 {
-	int bonus, prio;
+	int bonus, prio, vavavoom, max;
 
 	bonus = MAX_USER_PRIO*PRIO_BONUS_RATIO*p->sleep_avg/MAX_SLEEP_AVG/100 -
 			MAX_USER_PRIO*PRIO_BONUS_RATIO/100/2;
 
-	prio = p->static_prio - bonus;
+	/* lots of tokens = lots of vavavoom
+	        no tokens = no vavavoom      */
+	if ((vavavoom = tokens_left(p)) >= 0)
+	{
+		max = p->s_info->tokens_max;
+		vavavoom = max - vavavoom;
+		max = max * max;
+		vavavoom = MAX_USER_PRIO * VAVAVOOM_RATIO / 100
+			* (vavavoom*vavavoom - (max >> 2)) / max;
+
+		/*  alternative, geometric mapping
+		  vavavoom = -( MAX_USER_PRIO*VAVAVOOM_RATIO/100 * vavavoom
+		                / p->s_info->tokens_max -
+			        MAX_USER_PRIO*VAVAVOOM_RATIO/100/2    );*/
+	} else
+	{
+		vavavoom = 0;
+	}
+	/* vavavoom = ( MAX_USER_PRIO*VAVAVOOM_RATIO/100*tokens_left(p) -
+	   MAX_USER_PRIO*VAVAVOOM_RATIO/100/2    ); */
+
+	prio = p->static_prio - bonus + vavavoom;
 	if (prio < MAX_RT_PRIO)
 		prio = MAX_RT_PRIO;
 	if (prio > MAX_PRIO-1)
@@ -263,6 +297,56 @@
 }
 
 /*
+ * tokens_left - return the number of tokens in the process'
+ *               s_context's cpu token bucket, allocating more tokens
+ *               if necessary.
+ *
+ * Returns -1 if the process does not have a bucket.
+ */
+int tokens_left(struct task_struct *tsk)
+{
+
+	if (tsk->s_info && (tsk->s_info->flags & S_CTX_INFO_SCHED))
+	{
+		/* see if any more tokens are due */
+		tokens_alloc(tsk);
+
+		return tsk->s_info->tokens;
+	}
+	else
+	{
+		return -1;
+	}
+}
+
+/*
+ * tokens_recalc - recalculate the s_context's scheduling tokens
+ *
+ * a spinlock on tsk->s_info->tokens_lock must be acquired before
+ * calling this function.
+ */
+int tokens_recalc(struct task_struct *tsk)
+{
+	unsigned long diff;
+
+	diff = jiffies - tsk->s_info->tokens_jfy;
+
+	if (diff < (unsigned long) tsk->s_info->tokens_div)
+	{
+		return 0;
+	}
+
+	diff = ( diff / tsk->s_info->tokens_div );
+
+	tsk->s_info->tokens     += diff * tsk->s_info->tokens_fr;
+	tsk->s_info->tokens_jfy += diff * tsk->s_info->tokens_div;
+	if (tsk->s_info->tokens > tsk->s_info->tokens_max)
+		tsk->s_info->tokens = tsk->s_info->tokens_max;
+
+	return 1;
+}
+
+/*
  * activate_task - move a task to the runqueue.
  *
  * Also update all the scheduling statistics stuff. (sleep average
@@ -865,6 +949,7 @@
 	 */
 	if (p->sleep_avg)
 		p->sleep_avg--;
+
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
diff -urN linux-2.4.20-ac2/kernel/sys.c linux-2.4.20-ac2-ctx16/kernel/sys.c
--- linux-2.4.20-ac2/kernel/sys.c	Fri Feb 21 00:51:48 2003
+++ linux-2.4.20-ac2-ctx16/kernel/sys.c	Thu Feb 20 23:11:54 2003
@@ -1073,7 +1073,15 @@
 		// printk ("new s_info %d\n",current->pid);
 		s_info->s_context[0] = current->s_context;
 		s_info->refcount = 1;
-		atomic_set (&s_info->ticks,current->counter);
+
+		/* scheduling; hard code as constants for now */
+		s_info->tokens_fr = 1;
+		s_info->tokens_div = 4;
+		s_info->tokens     = HZ * 5;
+		s_info->tokens_max = HZ * 10;
+		s_info->tokens_jfy = jiffies;
+		spin_lock_init(&s_info->tokens_lock);
+
 		s_info->flags = 0;
 		s_info->initpid = 0;
 		down_read (&uts_sem);
diff -urN linux-2.4.20-ac2/kernel/timer.c linux-2.4.20-ac2-ctx16/kernel/timer.c
--- linux-2.4.20-ac2/kernel/timer.c	Fri Feb 21 00:51:48 2003
+++ linux-2.4.20-ac2-ctx16/kernel/timer.c	Wed Feb 19 22:31:02 2003
@@ -599,11 +599,8 @@
 	struct task_struct *p = current;
 	int cpu = smp_processor_id(), system = user_tick ^ 1;
 
-		if (p->s_info != NULL
-			&& (p->s_info->flags & S_CTX_INFO_SCHED)!=0){
-			// atomic_sub (ticks*p->s_info->refcount, &p->s_info->ticks);
-			atomic_dec (&p->s_info->ticks);
-		}
+	eat_token(p);
+
 	update_one_process(p, user_tick, system, cpu);
 	scheduler_tick(user_tick, system);
 }

Reply via email to