On Wed, Jan 18, 2012 at 04:41:35PM +0100, Gregor Best wrote:
> after a long time of silence here's a second iteration of the patch.
> I've addressed a few concerns voiced here:
> 
> * Process lookup and friends now have O(log n) runtime. I achieved that
>   by abusing RB-trees as priority queues since they have runtime
>   O(log n) in all relevant algorithms.

I'm wondering if a heap wouldn't be more suitable. But since heaps are
currently not in the kernel, I guess the RB tree is the best solution.
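
Roughly what I have in mind, keyed on p_deadline (just a sketch, not
tested; the array bound and all names are made up):

        /*
         * Binary min-heap ordered by p_deadline.  Push and pop are
         * O(log n), peeking at the earliest deadline is O(1).
         */
        struct proc *sched_heap[SCHED_HEAP_MAX]; /* hypothetical bound */
        int sched_heap_len;

        static void
        heap_push(struct proc *p)
        {
                int i = sched_heap_len++;

                /* Sift up while our deadline is earlier than the parent's. */
                while (i > 0 && timercmp(&p->p_deadline,
                    &sched_heap[(i - 1) / 2]->p_deadline, <)) {
                        sched_heap[i] = sched_heap[(i - 1) / 2];
                        i = (i - 1) / 2;
                }
                sched_heap[i] = p;
        }

The catch is that removing an arbitrary proc (remrunqueue) needs the
heap index stored in struct proc to stay O(log n), which is a point in
the RB tree's favor.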

> * The algorithm for calculating a new deadline for a given process has
>   been simplified and is now documented a bit better. It also derives
>   the deadline offset from the value of hz (via rrticks_init) as
>   suggested by Miod (?).
> * CPU rlimits are now honored again. The relevant code did not change, the
>   new patch just doesn't remove rlimit enforcement anymore.
> * Timeslices are 20ms long instead of 10ms. This solves the issue of 0ms
>   long timeslices on machines with hz < 100.

I'm not really happy about timeslices getting larger. They're
considerably large already. Can you think of a way to derive a sane
value from hz?
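
Something along these lines would keep the slice near 10ms regardless
of hz, while still avoiding the 0-tick slices on hz < 100 machines
(just a sketch):

        /* Aim for a ~10ms timeslice, but never less than one tick. */
        rrticks_init = hz / 100;
        if (rrticks_init < 1)
                rrticks_init = 1;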

> With recent improvements in the mainline scheduler and especially
> rthreads, the performance of the patched scheduler and mainline is now
> roughly similar, at least if throughput is concerned. I have the feeling
> that the system behaves "snappier" with my patch, but that might be some
> sort of placebo effect. I haven't yet come up with a reliable method to
> benchmark interactivity except for actually using the machine and doing
> stuff on it. It's interesting to note however that the patched scheduler
> achieves a performance similar to the default one without all the fancy
> methods for calculating how expensive it is to move a process from one
> CPU to another or related methods for preserving cache locality.

You'll want to test this on a multi-socket machine as well (preferably a
NUMA machine). You're currently migrating processes through the on-die
L3 cache, which is considerably faster than main memory.

> I use the patched scheduler exclusively on my Core2Duo machine with an
> MP build.
>
> The amount of lines removed versus added lines by this patch shifted
> towards "more added lines" but is still at 173 lines less than the
> default.
> 
> Once again, comments, rants, insults, everything is welcome :)

Comments on the code inline. This is just me glancing over it; I'll try
to get more in-depth over the weekend.


> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.149
> diff -u -r1.149 proc.h
> --- sys/proc.h        7 Jan 2012 05:38:12 -0000       1.149
> +++ sys/proc.h        17 Jan 2012 16:10:09 -0000
> @@ -43,6 +43,7 @@
>  #include <machine/proc.h>            /* Machine-dependent proc substruct. */
>  #include <sys/selinfo.h>             /* For struct selinfo */
>  #include <sys/queue.h>
> +#include <sys/tree.h>
>  #include <sys/timeout.h>             /* For struct timeout */
>  #include <sys/event.h>                       /* For struct klist */
>  #include <sys/mutex.h>                       /* For struct mutex */
> @@ -210,7 +211,9 @@
>  #define      PS_SINGLEUNWIND _P_SINGLEUNWIND
>  
>  struct proc {
> -     TAILQ_ENTRY(proc) p_runq;
> +     RB_ENTRY(proc) p_runq;
> +     RB_ENTRY(proc) p_expq;
> +     TAILQ_ENTRY(proc) p_slpq;
>       LIST_ENTRY(proc) p_list;        /* List of all processes. */
>  
>       struct  process *p_p;           /* The process of this thread. */
> @@ -251,6 +254,9 @@
>  #endif
>  
>       /* scheduling */
> +     struct timeval p_deadline; /* virtual deadline used for scheduling */
> +     struct timeval p_deadline_set; /* time at which the deadline was set */
> +     int  p_rrticks; /* number of ticks this process is allowed to stay on a processor */
>       u_int   p_estcpu;        /* Time averaged value of p_cpticks. */
>       int     p_cpticks;       /* Ticks of cpu time. */
>       fixpt_t p_pctcpu;        /* %cpu for this process during p_swtime */
> Index: sys/sched.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/sched.h,v
> retrieving revision 1.30
> diff -u -r1.30 sched.h
> --- sys/sched.h       16 Nov 2011 20:50:19 -0000      1.30
> +++ sys/sched.h       17 Jan 2012 16:10:09 -0000
> @@ -87,8 +87,6 @@
>  #define CP_IDLE              4
>  #define CPUSTATES    5
>  
> -#define      SCHED_NQS       32                      /* 32 run queues. */
> -
>  /*
>   * Per-CPU scheduler state.
>   * XXX - expose to userland for now.
> @@ -99,7 +97,6 @@
>       u_int spc_schedticks;           /* ticks for schedclock() */
>       u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
>       u_char spc_curpriority;         /* usrpri of curproc */
> -     int spc_rrticks;                /* ticks until roundrobin() */
>       int spc_pscnt;                  /* prof/stat counter */
>       int spc_psdiv;                  /* prof/stat divisor */ 
>       struct proc *spc_idleproc;      /* idle proc for this cpu */
> @@ -107,9 +104,6 @@
>       u_int spc_nrun;                 /* procs on the run queues */
>       fixpt_t spc_ldavg;              /* shortest load avg. for this cpu */
>  
> -     TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> -     volatile uint32_t spc_whichqs;
> -
>  #ifdef notyet
>       struct proc *spc_reaper;        /* dead proc reaper */
>  #endif
> @@ -119,18 +113,16 @@
>  #ifdef       _KERNEL
>  
>  /* spc_flags */
> -#define SPCF_SEENRR             0x0001  /* process has seen roundrobin() */
> -#define SPCF_SHOULDYIELD        0x0002  /* process should yield the CPU */
> -#define SPCF_SWITCHCLEAR        (SPCF_SEENRR|SPCF_SHOULDYIELD)
> -#define SPCF_SHOULDHALT              0x0004  /* CPU should be vacated */
> -#define SPCF_HALTED          0x0008  /* CPU has been halted */
> +#define SPCF_SHOULDYIELD        0x0001  /* process should yield the CPU */
> +#define SPCF_SHOULDHALT              0x0002  /* CPU should be vacated */
> +#define SPCF_HALTED          0x0004  /* CPU has been halted */
>  
> -#define      SCHED_PPQ       (128 / SCHED_NQS)       /* priorities per queue */
>  #define NICE_WEIGHT 2                        /* priorities per nice level */
> -#define      ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
> +#define      ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX)
>  
>  extern int schedhz;                  /* ideally: 16 */
>  extern int rrticks_init;             /* ticks per roundrobin() */
> +extern struct cpuset sched_idle_cpus;
>  
>  struct proc;
>  void schedclock(struct proc *);
> @@ -147,18 +139,20 @@
>  void cpu_switchto(struct proc *, struct proc *);
>  struct proc *sched_chooseproc(void);
>  struct cpu_info *sched_choosecpu(struct proc *);
> -struct cpu_info *sched_choosecpu_fork(struct proc *parent, int);
> +struct cpu_info *sched_choosecpu_fork(struct proc *parent);
>  void cpu_idle_enter(void);
>  void cpu_idle_cycle(void);
>  void cpu_idle_leave(void);
>  void sched_peg_curproc(struct cpu_info *ci);
>  
> +void generate_deadline(struct proc *, char);
> +
>  #ifdef MULTIPROCESSOR
>  void sched_start_secondary_cpus(void);
>  void sched_stop_secondary_cpus(void);
>  #endif
>  
> -#define curcpu_is_idle()     (curcpu()->ci_schedstate.spc_whichqs == 0)
> +#define curcpu_is_idle() (cpuset_isset(&sched_idle_cpus, curcpu()))
>  
>  void sched_init_runqueues(void);
>  void setrunqueue(struct proc *);
> Index: kern/kern_clock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> retrieving revision 1.72
> diff -u -r1.72 kern_clock.c
> --- kern/kern_clock.c 7 Mar 2011 07:07:13 -0000       1.72
> +++ kern/kern_clock.c 17 Jan 2012 16:10:10 -0000
> @@ -241,7 +241,11 @@
>       if (stathz == 0)
>               statclock(frame);
>  
> -     if (--ci->ci_schedstate.spc_rrticks <= 0)
> +     /*
> +      * If the currently running process went over its round robin tick quota,
> +      * preempt it.
> +      */
> +     if (p && (--(p->p_rrticks) <= 0))
>               roundrobin(ci);
>  
>       /*
> Index: kern/kern_fork.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_fork.c,v
> retrieving revision 1.133
> diff -u -r1.133 kern_fork.c
> --- kern/kern_fork.c  14 Dec 2011 07:32:16 -0000      1.133
> +++ kern/kern_fork.c  17 Jan 2012 16:10:10 -0000
> @@ -225,6 +225,9 @@
>               atomic_setbits_int(&pr->ps_flags, PS_CONTROLT);
>  
>       p->p_p = pr;
> +
> +     /* Pretend we preempted this new process */
> +     generate_deadline(p, 0);
>  }
>  
>  /* print the 'table full' message once per 10 seconds */
> @@ -485,7 +488,7 @@
>       getmicrotime(&p->p_stats->p_start);
>       p->p_acflag = AFORK;
>       p->p_stat = SRUN;
> -     p->p_cpu = sched_choosecpu_fork(curp, flags);
> +     p->p_cpu = sched_choosecpu_fork(curp);
>       setrunqueue(p);
>       SCHED_UNLOCK(s);
>  
> Index: kern/kern_proc.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_proc.c,v
> retrieving revision 1.47
> diff -u -r1.47 kern_proc.c
> --- kern/kern_proc.c  18 Sep 2011 23:20:54 -0000      1.47
> +++ kern/kern_proc.c  17 Jan 2012 16:10:10 -0000
> @@ -398,8 +398,6 @@
>           p->p_comm, p->p_pid, pst, p->p_flag, P_BITS);
>       (*pr)("    pri=%u, usrpri=%u, nice=%d\n",
>           p->p_priority, p->p_usrpri, p->p_p->ps_nice);
> -     (*pr)("    forw=%p, list=%p,%p\n",
> -         TAILQ_NEXT(p, p_runq), p->p_list.le_next, p->p_list.le_prev);
>       (*pr)("    process=%p user=%p, vmspace=%p\n",
>           p->p_p, p->p_addr, p->p_vmspace);
>       (*pr)("    estcpu=%u, cpticks=%d, pctcpu=%u.%u, swtime=%u\n",
> Index: kern/kern_sched.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sched.c,v
> retrieving revision 1.24
> diff -u -r1.24 kern_sched.c
> --- kern/kern_sched.c 12 Oct 2011 18:30:09 -0000      1.24
> +++ kern/kern_sched.c 17 Jan 2012 16:10:10 -0000
> @@ -19,6 +19,7 @@
>  
>  #include <sys/sched.h>
>  #include <sys/proc.h>
> +#include <sys/tree.h>
>  #include <sys/kthread.h>
>  #include <sys/systm.h>
>  #include <sys/resourcevar.h>
> @@ -32,9 +33,11 @@
>  
>  void sched_kthreads_create(void *);
>  
> -int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
>  struct proc *sched_steal_proc(struct cpu_info *);
>  
> +RB_HEAD(runqueue, proc) sched_runqueue;
> +RB_HEAD(expqueue, proc) sched_expqueue;
> +
>  /*
>   * To help choosing which cpu should run which process we keep track
>   * of cpus which are currently idle and which cpus have processes
> @@ -61,6 +64,27 @@
>   * interrupts during the context switch.
>   */
> 
> +static int
> +sched_cmp_proc(struct proc *a, struct proc *b) {
> +     if (a == b)
> +             return 0;
> +     if (timercmp(&(a->p_deadline), &(b->p_deadline), <))
> +             return -1;
> +     return 1;
> +}

My comments on this code are folded into those on the similar function below.

> +
> +static int
> +sched_cmp_proc_exp(struct proc *a, struct proc *b) {
> +     if (a == b)
> +             return 0;
> +     if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <))
> +             return -1;
> +     return 1;
> +}

The above comparison will fail if:
a != b, but a->p_deadline_set == b->p_deadline_set.
Your code hides this from view, because the a == b test before it
turns it into a rare occurrence. However, when it hits during insert or
remove, you may find your tree getting corrupted and your invariants
getting violated.

What you want is a multi-set semantic, like this:

static int
sched_cmp_proc_exp(struct proc *a, struct proc *b)
{
        int     cmp;

        if (timercmp(&(a->p_deadline_set), &(b->p_deadline_set), <))
                cmp = -1;
        else
                cmp = timercmp(&(b->p_deadline_set), &(a->p_deadline_set), <);

        /* Multiset behaviour: sort via identity. */
        if (cmp == 0)
                cmp = (a < b ? -1 : a > b);
        return cmp;
}

Note: the above breaks RB_FIND and RB_NFIND, since a search key that
isn't the element itself can never compare equal.
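
For example, searching with a synthetic key (t and the key struct are
made up for illustration):

        struct proc key;        /* dummy search key */

        key.p_deadline_set = t;
        /* Never matches: the comparator cannot return 0 for &key. */
        RB_FIND(expqueue, &sched_expqueue, &key);

Searching with the element pointer itself still works, since the
identity tie-break compares equal for a == a.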

> +
> +RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
> +RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);

That's gonna add a lot of code.

Btw, even though the implementation works fine without it, I'd really
like it if you could put matching RB_PROTOTYPE_STATIC invocations before
the generator macros. Using GENERATE without PROTOTYPE is abuse of an
implementation detail.
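
I.e. something like:

        RB_PROTOTYPE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
        RB_PROTOTYPE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);
        RB_GENERATE_STATIC(runqueue, proc, p_runq, sched_cmp_proc);
        RB_GENERATE_STATIC(expqueue, proc, p_expq, sched_cmp_proc_exp);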

> +
>  /*
>   * sched_init_cpu is called from main() for the boot cpu, then it's the
>   * responsibility of the MD code to call it for all other cpus.
> @@ -69,10 +93,6 @@
>  sched_init_cpu(struct cpu_info *ci)
>  {
>       struct schedstate_percpu *spc = &ci->ci_schedstate;
> -     int i;
> -
> -     for (i = 0; i < SCHED_NQS; i++)
> -             TAILQ_INIT(&spc->spc_qs[i]);
>  
>       spc->spc_idleproc = NULL;
>  
> @@ -106,6 +126,7 @@
>  {
>       struct schedstate_percpu *spc;
>       struct proc *p = curproc;
> +     struct proc *dead;
>       struct cpu_info *ci = v;
>       int s;
>  
> @@ -120,7 +141,6 @@
>       SCHED_LOCK(s);
>       cpuset_add(&sched_idle_cpus, ci);
>       p->p_stat = SSLEEP;
> -     p->p_cpu = ci;
>       atomic_setbits_int(&p->p_flag, P_CPUPEG);
>       mi_switch();
>       cpuset_del(&sched_idle_cpus, ci);
> @@ -130,38 +150,27 @@
>       KASSERT(curproc == spc->spc_idleproc);
>  
>       while (1) {
> -             while (!curcpu_is_idle()) {
> -                     struct proc *dead;
> -
> -                     SCHED_LOCK(s);
> -                     p->p_stat = SSLEEP;
> -                     mi_switch();
> -                     SCHED_UNLOCK(s);
> -
> -                     while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
> +             while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
>                               LIST_REMOVE(dead, p_hash);
>                               exit2(dead);
> -                     }
>               }
> +             if ((spc->spc_schedflags & SPCF_SHOULDHALT) && !(spc->spc_schedflags & SPCF_HALTED))
> +                     atomic_setbits_int(&spc->spc_schedflags, SPCF_HALTED);
>  
>               splassert(IPL_NONE);
>  
>               cpuset_add(&sched_idle_cpus, ci);
> +
>               cpu_idle_enter();
> -             while (spc->spc_whichqs == 0) {
> -                     if (spc->spc_schedflags & SPCF_SHOULDHALT &&
> -                         (spc->spc_schedflags & SPCF_HALTED) == 0) {
> -                             cpuset_del(&sched_idle_cpus, ci);
> -                             SCHED_LOCK(s);
> -                             atomic_setbits_int(&spc->spc_schedflags,
> -                                 spc->spc_whichqs ? 0 : SPCF_HALTED);
> -                             SCHED_UNLOCK(s);
> -                             wakeup(spc);
> -                     }
> -                     cpu_idle_cycle();
> -             }
> +             cpu_idle_cycle();
>               cpu_idle_leave();
> +
>               cpuset_del(&sched_idle_cpus, ci);
> +
> +             SCHED_LOCK(s);
> +             p->p_stat = SSLEEP;
> +             mi_switch();
> +             SCHED_UNLOCK(s);
>       }
>  }
>  
> @@ -206,63 +215,35 @@
>  void
>  sched_init_runqueues(void)
>  {
> +     RB_INIT(&sched_runqueue);
> +     RB_INIT(&sched_expqueue);
>  }
>  
>  void
>  setrunqueue(struct proc *p)
>  {
> -     struct schedstate_percpu *spc;
> -     int queue = p->p_priority >> 2;
> -
>       SCHED_ASSERT_LOCKED();
> -     spc = &p->p_cpu->ci_schedstate;
> -     spc->spc_nrun++;
> -
> -     TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
> -     spc->spc_whichqs |= (1 << queue);
> -     cpuset_add(&sched_queued_cpus, p->p_cpu);
> -
> -     if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
> -             cpu_unidle(p->p_cpu);
> +     RB_INSERT(runqueue, &sched_runqueue, p);
>  }
>  
>  void
>  remrunqueue(struct proc *p)
>  {
> -     struct schedstate_percpu *spc;
> -     int queue = p->p_priority >> 2;
> -
>       SCHED_ASSERT_LOCKED();
> -     spc = &p->p_cpu->ci_schedstate;
> -     spc->spc_nrun--;
> -
> -     TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
> -     if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
> -             spc->spc_whichqs &= ~(1 << queue);
> -             if (spc->spc_whichqs == 0)
> -                     cpuset_del(&sched_queued_cpus, p->p_cpu);
> -     }
> +     if (!(RB_REMOVE(runqueue, &sched_runqueue, p)))

Is it ok to call this function for a process not on the run queue?
Your code implies it is. If you don't intend that to happen, put a
KASSERT in.
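
E.g. at the top of remrunqueue, assuming (as the old code did) that
SRUN implies the process is on a queue:

        SCHED_ASSERT_LOCKED();
        /* Catch callers removing a proc that was never queued. */
        KASSERT(p->p_stat == SRUN);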

> +             RB_REMOVE(expqueue, &sched_expqueue, p);

Please check the return value here as well. I take it a process that is
not on the runqueue must be on the expqueue?

>  }
>  
>  struct proc *
>  sched_chooseproc(void)
>  {
>       struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> -     struct proc *p;
> -     int queue;
> +     struct proc *p = NULL;
> +     struct timeval now;
>  
>       SCHED_ASSERT_LOCKED();
>  
>       if (spc->spc_schedflags & SPCF_SHOULDHALT) {
> -             if (spc->spc_whichqs) {
> -                     for (queue = 0; queue < SCHED_NQS; queue++) {
> -                             TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
> -                                     remrunqueue(p);
> -                                     p->p_cpu = sched_choosecpu(p);
> -                                     setrunqueue(p);
> -                             }
> -                     }
> -             }
>               p = spc->spc_idleproc;
>               KASSERT(p);
>               KASSERT(p->p_wchan == NULL);
> @@ -270,16 +251,30 @@
>               return (p);
>       }
>  
> -again:
> -     if (spc->spc_whichqs) {
> -             queue = ffs(spc->spc_whichqs) - 1;
> -             p = TAILQ_FIRST(&spc->spc_qs[queue]);
> -             remrunqueue(p);
> -             KASSERT(p->p_stat == SRUN);
> -     } else if ((p = sched_steal_proc(curcpu())) == NULL) {
> -             p = spc->spc_idleproc;
> -             if (p == NULL) {
> -                        int s;
> +     if ((!RB_EMPTY(&sched_runqueue)) || (!RB_EMPTY(&sched_expqueue))) {
> +             /*
> +              * Move expired processes to the expired runqueue where they are sorted by the time they got their
> +              * deadline set instead of the deadline itself.
> +              */
> +             microuptime(&now);
> +             while(!RB_EMPTY(&sched_runqueue)) {
> +                     p = RB_MIN(runqueue, &sched_runqueue);
> +                     if (!p) break;
> +                     if (!timercmp(&(p->p_deadline), &now, <)) break;
> +                     RB_INSERT(expqueue, &sched_expqueue, p);
> +                     RB_REMOVE(runqueue, &sched_runqueue, p);
> +             }
> +
> +             if (!RB_EMPTY(&sched_expqueue)) {
> +                     p = RB_MIN(expqueue, &sched_expqueue);
> +             } else {
> +                     p = RB_MIN(runqueue, &sched_runqueue);
> +             }

Braces are not required here.

> +             if (p)
> +                     remrunqueue(p);
> +     } else {
> +             while ((p = spc->spc_idleproc) == NULL) {
> +                     int s;
>                       /*
>                        * We get here if someone decides to switch during
>                        * boot before forking kthreads, bleh.
> @@ -291,13 +286,15 @@
>                       spl0();
>                       delay(10);
>                       SCHED_LOCK(s);
> -                     goto again;
> -                }
> +             }
>               KASSERT(p);
> +     }
> +
> +     if (p) {
>               p->p_stat = SRUN;
> +             p->p_cpu = curcpu();
>       } 
>  
> -     KASSERT(p->p_wchan == NULL);
>       return (p);     
>  }
>  
> @@ -310,30 +307,13 @@
>  uint64_t sched_nomigrations;
>  
>  struct cpu_info *
> -sched_choosecpu_fork(struct proc *parent, int flags)
> +sched_choosecpu_fork(struct proc *parent)
>  {
>       struct cpu_info *choice = NULL;
>       fixpt_t load, best_load = ~0;
> -     int run, best_run = INT_MAX;
>       struct cpu_info *ci;
>       struct cpuset set;
>  
> -#if 0
> -     /*
> -      * XXX
> -      * Don't do this until we have a painless way to move the cpu in exec.
> -      * Preferably when nuking the old pmap and getting a new one on a
> -      * new cpu.
> -      */
> -     /*
> -      * PPWAIT forks are simple. We know that the parent will not
> -      * run until we exec and choose another cpu, so we just steal its
> -      * cpu.
> -      */
> -     if (flags & FORK_PPWAIT)
> -             return (parent->p_cpu);
> -#endif
> -
>       /*
>        * Look at all cpus that are currently idle and have nothing queued.
>        * If there are none, pick the one with least queued procs first,
> @@ -347,13 +327,10 @@
>               cpuset_del(&set, ci);
>  
>               load = ci->ci_schedstate.spc_ldavg;
> -             run = ci->ci_schedstate.spc_nrun;
>  
> -             if (choice == NULL || run < best_run ||
> -                 (run == best_run &&load < best_load)) {
> +             if (choice == NULL || load < best_load) {
>                       choice = ci;
>                       best_load = load;
> -                     best_run = run;
>               }
>       }
>  
> @@ -364,9 +341,6 @@
>  sched_choosecpu(struct proc *p)
>  {
>       struct cpu_info *choice = NULL;
> -     int last_cost = INT_MAX;
> -     struct cpu_info *ci;
> -     struct cpuset set;
>  
>       /*
>        * If pegged to a cpu, don't allow it to move.
> @@ -374,169 +348,19 @@
>       if (p->p_flag & P_CPUPEG)
>               return (p->p_cpu);
>  
> -     sched_choose++;
> -
> -     /*
> -      * Look at all cpus that are currently idle and have nothing queued.
> -      * If there are none, pick the cheapest of those.
> -      * (idle + queued could mean that the cpu is handling an interrupt
> -      * at this moment and haven't had time to leave idle yet).
> -      */
> -     cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
> -
>       /*
> -      * First, just check if our current cpu is in that set, if it is,
> -      * this is simple.
> -      * Also, our cpu might not be idle, but if it's the current cpu
> -      * and it has nothing else queued and we're curproc, take it.
> +      * Else, just pretend we forked a new process.
>        */
> -     if (cpuset_isset(&set, p->p_cpu) ||
> -         (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
> -         curproc == p)) {
> -             sched_wasidle++;
> -             return (p->p_cpu);
> -     }
> -
> -     if (cpuset_first(&set) == NULL)
> -             cpuset_copy(&set, &sched_all_cpus);
> -
> -     while ((ci = cpuset_first(&set)) != NULL) {
> -             int cost = sched_proc_to_cpu_cost(ci, p);
> +     sched_choose++;
>  
> -             if (choice == NULL || cost < last_cost) {
> -                     choice = ci;
> -                     last_cost = cost;
> -             }
> -             cpuset_del(&set, ci);
> -     }
> +     choice = sched_choosecpu_fork(p);
>  
>       if (p->p_cpu != choice)
>               sched_nmigrations++;
>       else
>               sched_nomigrations++;
>  
> -     return (choice);
> -}
> -
> -/*
> - * Attempt to steal a proc from some cpu.
> - */
> -struct proc *
> -sched_steal_proc(struct cpu_info *self)
> -{
> -     struct schedstate_percpu *spc;
> -     struct proc *best = NULL;
> -     int bestcost = INT_MAX;
> -     struct cpu_info *ci;
> -     struct cpuset set;
> -
> -     cpuset_copy(&set, &sched_queued_cpus);
> -
> -     while ((ci = cpuset_first(&set)) != NULL) {
> -             struct proc *p;
> -             int queue;
> -             int cost;
> -
> -             cpuset_del(&set, ci);
> -
> -             spc = &ci->ci_schedstate;
> -
> -             queue = ffs(spc->spc_whichqs) - 1;
> -             TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
> -                     if (p->p_flag & P_CPUPEG)
> -                             continue;
> -
> -                     cost = sched_proc_to_cpu_cost(self, p);
> -
> -                     if (best == NULL || cost < bestcost) {
> -                             best = p;
> -                             bestcost = cost;
> -                     }
> -             }
> -     }
> -     if (best == NULL)
> -             return (NULL);
> -
> -     spc = &best->p_cpu->ci_schedstate;
> -     remrunqueue(best);
> -     best->p_cpu = self;
> -
> -     sched_stolen++;
> -
> -     return (best);
> -}
> -
> -/*
> - * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
> - */
> -static int
> -log2(unsigned int i)
> -{
> -     int ret = 0;
> -
> -     while (i >>= 1)
> -             ret++;
> -
> -     return (ret);
> -}
> -
> -/*
> - * Calculate the cost of moving the proc to this cpu.
> - * 
> - * What we want is some guesstimate of how much "performance" it will
> - * cost us to move the proc here. Not just for caches and TLBs and NUMA
> - * memory, but also for the proc itself. A highly loaded cpu might not
> - * be the best candidate for this proc since it won't get run.
> - *
> - * Just total guesstimates for now.
> - */
> -
> -int sched_cost_load = 1;
> -int sched_cost_priority = 1;
> -int sched_cost_runnable = 3;
> -int sched_cost_resident = 1;
> -
> -int
> -sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
> -{
> -     struct schedstate_percpu *spc;
> -     int l2resident = 0;
> -     int cost;
> -
> -     spc = &ci->ci_schedstate;
> -
> -     cost = 0;
> -
> -     /*
> -      * First, account for the priority of the proc we want to move.
> -      * More willing to move, the lower the priority of the destination
> -      * and the higher the priority of the proc.
> -      */
> -     if (!cpuset_isset(&sched_idle_cpus, ci)) {
> -             cost += (p->p_priority - spc->spc_curpriority) *
> -                 sched_cost_priority;
> -             cost += sched_cost_runnable;
> -     }
> -     if (cpuset_isset(&sched_queued_cpus, ci)) {
> -             cost += spc->spc_nrun * sched_cost_runnable;
> -     }
> -
> -     /*
> -      * Higher load on the destination means we don't want to go there.
> -      */
> -     cost += ((sched_cost_load * spc->spc_ldavg) >> FSHIFT);
> -
> -     /*
> -      * If the proc is on this cpu already, lower the cost by how much
> -      * it has been running and an estimate of its footprint.
> -      */
> -     if (p->p_cpu == ci && p->p_slptime == 0) {
> -             l2resident =
> -                 log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
> -             cost -= l2resident * sched_cost_resident;
> -     }
> -
> -     return (cost);
> +     return choice;
>  }
>  
>  /*
> @@ -560,7 +384,6 @@
>  }
>  
>  #ifdef MULTIPROCESSOR
> -
>  void
>  sched_start_secondary_cpus(void)
>  {
> @@ -583,6 +406,9 @@
>  {
>       CPU_INFO_ITERATOR cii;
>       struct cpu_info *ci;
> +     int s;
> +
> +     SCHED_LOCK(s);
>  
>       /*
>        * Make sure we stop the secondary CPUs.
> @@ -601,14 +427,17 @@
>  
>               if (CPU_IS_PRIMARY(ci))
>                       continue;
> -             while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
> +
> +             SCHED_UNLOCK(s);
> +             while (!(spc->spc_schedflags & SPCF_HALTED)) {
>                       sleep_setup(&sls, spc, PZERO, "schedstate");
> -                     sleep_finish(&sls,
> -                         (spc->spc_schedflags & SPCF_HALTED) == 0);
> +                     sleep_setup_timeout(&sls, 10);
> +                     sleep_finish(&sls, 1);
>               }
> +             SCHED_LOCK(s);
>       }
> +     SCHED_UNLOCK(s);
>  }
> -
>  #endif
>  
>  /*
> Index: kern/kern_synch.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_synch.c,v
> retrieving revision 1.99
> diff -u -r1.99 kern_synch.c
> --- kern/kern_synch.c 17 Jan 2012 02:34:18 -0000      1.99
> +++ kern/kern_synch.c 17 Jan 2012 16:10:10 -0000
> @@ -205,7 +205,7 @@
>       p->p_wmesg = wmesg;
>       p->p_slptime = 0;
>       p->p_priority = prio & PRIMASK;
> -     TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_runq);
> +     TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_slpq);
>  }
>  
>  void
> @@ -342,7 +342,7 @@
>  unsleep(struct proc *p)
>  {
>       if (p->p_wchan) {
> -             TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq);
> +             TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_slpq);
>               p->p_wchan = NULL;
>       }
>  }
> @@ -361,7 +361,7 @@
>       SCHED_LOCK(s);
>       qp = &slpque[LOOKUP(ident)];
>       for (p = TAILQ_FIRST(qp); p != NULL && n != 0; p = pnext) {
> -             pnext = TAILQ_NEXT(p, p_runq);
> +             pnext = TAILQ_NEXT(p, p_slpq);
>  #ifdef DIAGNOSTIC
>               if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
>                       panic("wakeup: p_stat is %d", (int)p->p_stat);
> @@ -369,7 +369,7 @@
>               if (p->p_wchan == ident) {
>                       --n;
>                       p->p_wchan = 0;
> -                     TAILQ_REMOVE(qp, p, p_runq);
> +                     TAILQ_REMOVE(qp, p, p_slpq);
>                       if (p->p_stat == SSLEEP) {
>                               /* OPTIMIZED EXPANSION OF setrunnable(p); */
>                               if (p->p_slptime > 1)
> Index: kern/sched_bsd.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/sched_bsd.c,v
> retrieving revision 1.27
> diff -u -r1.27 sched_bsd.c
> --- kern/sched_bsd.c  7 Jul 2011 18:00:33 -0000       1.27
> +++ kern/sched_bsd.c  17 Jan 2012 16:10:10 -0000
> @@ -77,37 +77,23 @@
>  
>       timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
>  
> -     rrticks_init = hz / 10;
> +     rrticks_init = hz / 50;
>       schedcpu(&schedcpu_to);
>  }
>  
>  /*
> - * Force switch among equal priority processes every 100ms.
> + * Force switch among equal priority processes every 20ms.
>   */
>  void
>  roundrobin(struct cpu_info *ci)
>  {
>       struct schedstate_percpu *spc = &ci->ci_schedstate;
>  
> -     spc->spc_rrticks = rrticks_init;
> -
>       if (ci->ci_curproc != NULL) {
> -             if (spc->spc_schedflags & SPCF_SEENRR) {
> -                     /*
> -                      * The process has already been through a roundrobin
> -                      * without switching and may be hogging the CPU.
> -                      * Indicate that the process should yield.
> -                      */
> -                     atomic_setbits_int(&spc->spc_schedflags,
> -                         SPCF_SHOULDYIELD);
> -             } else {
> -                     atomic_setbits_int(&spc->spc_schedflags,
> -                         SPCF_SEENRR);
> -             }
> +             atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
>       }
>  
> -     if (spc->spc_nrun)
> -             need_resched(ci);
> +     need_resched(ci);
>  }
>  
>  /*
> @@ -204,7 +190,6 @@
>       struct timeout *to = (struct timeout *)arg;
>       fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
>       struct proc *p;
> -     int s;
>       unsigned int newcpu;
>       int phz;
>  
> @@ -233,7 +218,6 @@
>                */
>               if (p->p_slptime > 1)
>                       continue;
> -             SCHED_LOCK(s);
>               /*
>                * p_pctcpu is only for ps.
>                */
> @@ -250,17 +234,6 @@
>               newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
>               p->p_estcpu = newcpu;
>               resetpriority(p);
> -             if (p->p_priority >= PUSER) {
> -                     if (p->p_stat == SRUN &&
> -                         (p->p_priority / SCHED_PPQ) !=
> -                         (p->p_usrpri / SCHED_PPQ)) {
> -                             remrunqueue(p);
> -                             p->p_priority = p->p_usrpri;
> -                             setrunqueue(p);
> -                     } else
> -                             p->p_priority = p->p_usrpri;
> -             }
> -             SCHED_UNLOCK(s);
>       }
>       uvm_meter();
>       wakeup(&lbolt);
> @@ -278,8 +251,6 @@
>       unsigned int newcpu = p->p_estcpu;
>       fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
>  
> -     SCHED_ASSERT_LOCKED();
> -
>       if (p->p_slptime > 5 * loadfac)
>               p->p_estcpu = 0;
>       else {
> @@ -304,6 +275,7 @@
>       SCHED_LOCK(s);
>       p->p_priority = p->p_usrpri;
>       p->p_stat = SRUN;
> +     generate_deadline(p, 1);
>       setrunqueue(p);
>       p->p_stats->p_ru.ru_nvcsw++;
>       mi_switch();
> @@ -332,6 +304,7 @@
>       p->p_priority = p->p_usrpri;
>       p->p_stat = SRUN;
>       p->p_cpu = sched_choosecpu(p);
> +     generate_deadline(p, 0);
>       setrunqueue(p);
>       p->p_stats->p_ru.ru_nivcsw++;
>       mi_switch();
> @@ -372,14 +345,7 @@
>        * process was running, and add that to its total so far.
>        */
>       microuptime(&tv);
> -     if (timercmp(&tv, &spc->spc_runtime, <)) {
> -#if 0
> -             printf("uptime is not monotonic! "
> -                 "tv=%lu.%06lu, runtime=%lu.%06lu\n",
> -                 tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec,
> -                 spc->spc_runtime.tv_usec);
> -#endif
> -     } else {
> +     if (timercmp(&tv, &spc->spc_runtime, >=)) {
>               timersub(&tv, &spc->spc_runtime, &tv);
>               timeradd(&p->p_rtime, &tv, &p->p_rtime);
>       }
> @@ -403,7 +369,7 @@
>        * Process is about to yield the CPU; clear the appropriate
>        * scheduling flags.
>        */
> -     atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
> +     atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
>  
>       nextproc = sched_chooseproc();
>  
> @@ -435,9 +401,8 @@
>        * be running on a new CPU now, so don't use the cache'd
>        * schedstate_percpu pointer.
>        */
> -     KASSERT(p->p_cpu == curcpu());
> -
> -     microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
> +     if (p->p_cpu == curcpu())
> +             microuptime(&p->p_cpu->ci_schedstate.spc_runtime);
>  
>  #ifdef MULTIPROCESSOR
>       /*
> @@ -515,30 +480,56 @@
>       }
>       p->p_stat = SRUN;
>       p->p_cpu = sched_choosecpu(p);
> -     setrunqueue(p);
>       if (p->p_slptime > 1)
>               updatepri(p);
> +     setrunqueue(p);
>       p->p_slptime = 0;
>       resched_proc(p, p->p_priority);
>  }
>  
>  /*
>   * Compute the priority of a process when running in user mode.
> - * Arrange to reschedule if the resulting priority is better
> - * than that of the current process.
>   */
>  void
>  resetpriority(struct proc *p)
>  {
> -     unsigned int newpriority;
> -
> -     SCHED_ASSERT_LOCKED();
> +     p->p_usrpri = min(((NZERO + (p->p_p->ps_nice))) << 1, MAXPRI);

Nice levels deserve a look all on their own. The current semantic is
very 4BSD, which was great when processes moved to disk on a regular
basis, but is utterly confusing nowadays.
However, you probably don't want to change too much in a single diff.

> +}
>  
> -     newpriority = PUSER + p->p_estcpu +
> -         NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
> -     newpriority = min(newpriority, MAXPRI);
> -     p->p_usrpri = newpriority;
> -     resched_proc(p, p->p_usrpri);
> +/*
> + * Generate a new virtual deadline for a process. The deadline is a soft
> + * one and has no purpose besides being used for choosing the next process
> + * to run. Also resets the number of round robin ticks available to the
> + * process if the previous timeslice expired and the process had to be preempted.
> + */
> +void
> +generate_deadline(struct proc *p, char voluntary) {
> +     /*
> +      * For nice values between 0 and 39 inclusively, the offset lies between
> +      * 32 and 1280 milliseconds for a machine with hz=100. That means that
> +      * processes with nice value=0 (i.e. -20 in userland) will be executed
> +      * 32 milliseconds in the future at the latest. Processes with very
> +      * little priority will be executed 1.28 seconds in the future at the 
> very
> +      * latest. The shift is done to ensure that the lowest possible offset 
> is
> +      * larger than the timeslice, in order to make sure that the scheduler 
> does
> +      * not degenerate to round robin behaviour when more than just a few 
> processes
> +      * with high priority are started.
> +      *
> +      * If the process voluntarily yielded its CPU, we reward it by halving its
> +      * deadline offset.
> +      */
> +     unsigned int offset_msec = ((p->p_p->ps_nice + 1) * rrticks_init) << (voluntary ? 2 : 3);
> +     struct timeval offset = {
> +             .tv_sec  = offset_msec / 1000,
> +             .tv_usec = offset_msec % 1000

tv_usec is measured in microseconds, while the msec suffix suggests
milliseconds. I'd change that to:
                .tv_usec = (offset_msec % 1000) * 1000
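
The full initializer would then read:

        struct timeval offset = {
                .tv_sec  = offset_msec / 1000,
                .tv_usec = (offset_msec % 1000) * 1000
        };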

> +     };
> +     struct timeval now;
> +     microuptime(&now);
> +     bcopy(&now, &(p->p_deadline_set), sizeof(struct timeval));
> +
> +     timeradd(&now, &offset, &(p->p_deadline));
> +     if (!voluntary)
> +             p->p_rrticks = rrticks_init;
>  }
>  
>  /*
> @@ -559,12 +550,8 @@
>  void
>  schedclock(struct proc *p)
>  {
> -     int s;
> -
> -     SCHED_LOCK(s);
>       p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
>       resetpriority(p);
>       if (p->p_priority >= PUSER)
>               p->p_priority = p->p_usrpri;
> -     SCHED_UNLOCK(s);
>  }


I really like the deadlines. They convey intent very well.
-- 
Ariane
