Hi,
This effort is heavily based on top of Gregor's and Michal's diffs. Tried to
incorporate feedback given by different people to them in 2011/2016. Split the
new code into an ifdef, so people can do a straight comparison, and tried very
hard not to delete existing code, just shifted it around. Main motivation was to
find if it is possible to do incremental improvements in the scheduler. After
some time, I have still not completely understood the load avg part,
p_priority/p_usrpri calculations, the cpuset code. Looked heavily at Dragonfly
(simpler than FreeBSD) and FreeBSD code. As a newcomer, OpenBSD code is way
easier to read. Thanks for the detailed explanations given to Michal and also
in later diffs posted to tech@, they helped a lot when trying to understand the
decisions made by other devs, the commit messages help a little but the
explanations help a lot more. This invites discussions and end users learn a
lot more in the process.
This diff survived multiple tens of kernel builds, a bsd.sp build, bsd.rd
build, a bsd.mp without these changes, userland/xenocara, a make regress a few
days ago all on 3 desktops on amd64. Ran under all possible scenarios listed in
previous sentence. No major invasive changes attempted, all tools should work
as is; if it's broken, please let me know. This is faster than current, but not
sure by how much, placebo effect in play.
I think there is a bug in resched_proc() which is fixed in mi_switch(), a
quick enhancement in cpu_switchto(), p_slptime, and precise round robin
interval for each proc unless preempted or it yields().
Tried to fiddle with different time slices other than hz/10, not sure if it
will work on other arches, but tried to stay within MI code, so it should work.
Other than counting, I have no idea how to verify that a proc is actually
switched away after its rr interval is over. Just went by what is there in the
existing code. Tried
giving higher slice like 200 ms, but didn't want to exceed the rucheck() in
kern_resource 1 sec limit.
Tried to do rudimentary work on affinity without having an affinity field in
struct proc or struct schedstate_percpu (like Dragonfly/FreeBSD do). Observed
the unbalance in runqs. Affinity works most of the time under light load.
There's a problem when I try to comment out sched_steal_proc(), in kern_sched,
that is the problem with this iteration of the diff.
Not sure if the reduction from 32 queues to a single TAILQ would be accepted,
but tried it anyway, it is definitely faster. This code tries to reduce the
complexity in deciding which queue to place the proc on. There is no current
support for a real-time queue or other types of scheduling classes, so IMHO
this is a simplification.
Tried to give detailed explanation of thinking. Sent the complete diff, but
will split diff, if parts of it are found to be valid.
In any case, a request to please accept a small change in top, to display
p_priority directly.
Thanks
diff --git a/sys/conf/GENERIC b/sys/conf/GENERIC
index d6a4fdcb0e2..73995746e23 100644
--- a/sys/conf/GENERIC
+++ b/sys/conf/GENERIC
@@ -3,6 +3,7 @@
# Machine-independent option; used by all architectures for their
# GENERIC kernel
+option TSTSCHED # test scheduler
option DDB # in-kernel debugger
#option DDBPROF # ddb(4) based profiling
#option DDB_SAFE_CONSOLE # allow break into ddb during boot
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index effefd552c3..be58199d834 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -129,6 +129,10 @@ int ncpus = 1;
int ncpusfound = 1; /* number of cpus we find */
volatile int start_init_exec; /* semaphore for start_init() */
+#ifdef TSTSCHED
+int sched_start_dquantum = 0;
+#endif
+
#if !defined(NO_PROPOLICE)
long __guard_local __attribute__((section(".openbsd.randomdata")));
#endif
@@ -572,6 +576,21 @@ main(void *framep)
start_periodic_resettodr();
+ /* Inited all systems, ready to start the test scheduler now!
+ *
+ * I saw some instability, when rr_intvl not set to hz/10 during
+ * boot, so resorting to fiddling with rr interval after we
+ * have booted up fully.
+ * */
+#ifdef TSTSCHED
+ printf("-----------------------------------------------------\n");
+ printf("-----------------------------------------------------\n");
+ printf("------------- TEST SCHEDULER starting -------------\n");
+ printf("-----------------------------------------------------\n");
+ printf("-----------------------------------------------------\n");
+ sched_start_dquantum = 1;
+#endif
+
/*
* proc0: nothing to do, back to sleep
*/
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index ee701945966..8db6611bff3 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -143,7 +143,6 @@ void
hardclock(struct clockframe *frame)
{
struct proc *p;
- struct cpu_info *ci = curcpu();
p = curproc;
if (p && ((p->p_flag & (P_SYSTEM | P_WEXIT)) == 0)) {
@@ -171,6 +170,20 @@ hardclock(struct clockframe *frame)
if (stathz == 0)
statclock(frame);
+#ifdef TSTSCHED
+ check_quantum_expiration();
+
+ /* bsd.sp compile complained of unused ci variable, so
+ * moved it here.*/
+
+ /*
+ * If we are not the primary CPU, we're not allowed to do
+ * any more work.
+ */
+ if (CPU_IS_PRIMARY(curcpu()) == 0)
+ return;
+#else
+ struct cpu_info *ci = curcpu();
if (--ci->ci_schedstate.spc_rrticks <= 0)
roundrobin(ci);
@@ -180,6 +193,7 @@ hardclock(struct clockframe *frame)
*/
if (CPU_IS_PRIMARY(ci) == 0)
return;
+#endif
tc_ticktock();
ticks++;
@@ -394,6 +408,9 @@ statclock(struct clockframe *frame)
spc->spc_pscnt = psdiv;
if (p != NULL) {
+#ifdef TSTSCHED
+ p->p_cnsd_slice++;
+#endif
p->p_cpticks++;
/*
* If no schedclock is provided, call it here at ~~12-25 Hz;
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 578752c75aa..559e2c5ea65 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -152,6 +152,12 @@ thread_new(struct proc *parent, vaddr_t uaddr)
p->p_stat = SIDL; /* protect against others */
p->p_flag = 0;
+ /* default: hz/10 ticks for each thread. */
+#ifdef TSTSCHED
+ p->p_dquantum = low_med_rrticks;
+ KASSERT(p->p_dquantum > 0);
+#endif
+
/*
* Make a proc table entry for the new process.
* Start by zeroing the section of proc that is zero-initialized,
@@ -329,7 +335,12 @@ fork_thread_start(struct proc *p, struct proc *parent, int
flags)
SCHED_LOCK(s);
p->p_stat = SRUN;
+ /* mpi diff to tech@ */
+#ifdef TSTSCHED
+ p->p_cpu = sched_choosecpu(parent);
+#else
p->p_cpu = sched_choosecpu_fork(parent, flags);
+#endif
setrunqueue(p);
SCHED_UNLOCK(s);
}
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 3eec06330ec..63e36e10fd6 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -35,7 +35,7 @@ int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc
*p);
struct proc *sched_steal_proc(struct cpu_info *);
/*
- * To help choosing which cpu should run which process we keep track
+ * To help choose which cpu should run which process we keep track
* of cpus which are currently idle and which cpus have processes
* queued.
*/
@@ -49,7 +49,9 @@ struct cpuset sched_all_cpus;
uint64_t sched_nmigrations; /* Cpu migration counter */
uint64_t sched_nomigrations; /* Cpu no migration counter */
uint64_t sched_noidle; /* Times we didn't pick the idle task */
+#ifndef TSTSCHED
uint64_t sched_stolen; /* Times we stole proc from other cpus */
+#endif
uint64_t sched_choose; /* Times we chose a cpu */
uint64_t sched_wasidle; /* Times we came out of idle */
@@ -59,6 +61,15 @@ struct taskq *sbartq;
int sched_smt;
+#ifdef TSTSCHED
+int hi_rrticks_ctr=0; /* 40 ms rr intvl */
+int med_hi_rrticks_ctr=0; /* 60 ms rr intvl */
+int med_rrticks_ctr=0; /* 80 ms rr intvl */
+int low_med_rrticks_ctr=0; /* 100 ms rr intvl, orig BSD schdlr */
+int low_rrticks_ctr=0; /* 120 ms rr intvl */
+int sys_rrticks_ctr=0; /* 200 ms rr intvl */
+#endif
+
/*
* A few notes about cpu_switchto that is implemented in MD code.
*
@@ -84,10 +95,43 @@ void
sched_init_cpu(struct cpu_info *ci)
{
struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+ /*
+ * The number of procs in runqueues are not that large.
+ * In my testing, they are always in single digit percentage
+ * of the total procs. They are processed quite fast, so we need a
+ * simpler data structure compared to 32 queues.
+ *
+ * The queue index is itself based on usrpri, which starts at
+ * PUSER, and was frequently observed to be using high array
+ * indexes. After boot, low array indexes, but afterwards, a
+ * consistently high array index. This defeats the very purpose
+ * of having 32 separate queues with different priorities in mind.
+ * Also, having 32 queues but the current code does not
+ * really deal correctly with p_priority as mpi@ found. It might
+ * be a waste of memory to have 32 queues per CPU.
+ *
+ * The existing code is quite good with just a single TAILQ.
+ *
+ * As Michal observed, they are maximum 4 observed per CPU runq.
+ * I have not yet observed more than 4 per CPU runq even when
+ * tried to put different systems under heavy load. On other
+ * high end boxen runq length might be higher, not sure of exact #.
+ *
+ * At this time, a Red-black tree gave a worse performance as
+ * compared to a simple TAILQ, using Michal's compare func.
+ * In this RB test, I used a simple monotonically increasing
+ * p_deadline counter right after a call to p_cpticks++ in
+ * statclock() in kern_clock.c
+ */
+#ifdef TSTSCHED
+ TAILQ_INIT(&spc->td_runq);
+#else
int i;
for (i = 0; i < SCHED_NQS; i++)
TAILQ_INIT(&spc->spc_qs[i]);
+#endif
spc->spc_idleproc = NULL;
@@ -178,14 +222,23 @@ sched_idle(void *v)
cpuset_add(&sched_idle_cpus, ci);
cpu_idle_enter();
+#ifdef TSTSCHED
+ while (TAILQ_EMPTY(&spc->td_runq)) {
+#else
while (spc->spc_whichqs == 0) {
+#endif
#ifdef MULTIPROCESSOR
if (spc->spc_schedflags & SPCF_SHOULDHALT &&
(spc->spc_schedflags & SPCF_HALTED) == 0) {
cpuset_del(&sched_idle_cpus, ci);
SCHED_LOCK(s);
+#ifdef TSTSCHED
+ atomic_setbits_int(&spc->spc_schedflags,
+ !TAILQ_EMPTY(&spc->td_runq) ? 0 :
SPCF_HALTED);
+#else
atomic_setbits_int(&spc->spc_schedflags,
spc->spc_whichqs ? 0 : SPCF_HALTED);
+#endif
SCHED_UNLOCK(s);
wakeup(spc);
}
@@ -216,7 +269,11 @@ sched_exit(struct proc *p)
struct proc *idle;
int s;
+#ifdef TSTSCHED
+ getnanouptime(&ts);
+#else
nanouptime(&ts);
+#endif
timespecsub(&ts, &spc->spc_runtime, &ts);
timespecadd(&p->p_rtime, &ts, &p->p_rtime);
@@ -247,14 +304,60 @@ void
setrunqueue(struct proc *p)
{
struct schedstate_percpu *spc;
- int queue = p->p_priority >> 2;
SCHED_ASSERT_LOCKED();
spc = &p->p_cpu->ci_schedstate;
spc->spc_nrun++;
+#ifdef TSTSCHED
+ /* change the p_dquantum for the entire kernel here */
+ p->p_dquantum = get_rr_dquantum(p);
+
+ /* Verify the counters are working for different
+ * rr intervals. Clang complains expression is not an
+ * integer constant, so hard-code the values verified in
+ * a printf in scheduler_start(). These counters are for
+ * printing how many interval counts we are getting.
+ * This code below needs to be deleted out later.*/
+
+ switch(p->p_dquantum) {
+ case 4:
+ hi_rrticks_ctr++;
+ break;
+ case 6:
+ med_hi_rrticks_ctr++;
+ break;
+ case 8:
+ med_rrticks_ctr++;
+ break;
+ case 10:
+ low_med_rrticks_ctr++;
+ break;
+ case 12:
+ low_rrticks_ctr++;
+ break;
+ case 20:
+ sys_rrticks_ctr++;
+ break;
+ default:
+ low_med_rrticks_ctr++;
+ break;
+ }
+ /* FreeBSD does this, high priority proc inserted into
+ * head of queue. they get preference under any condition
+ * when they hit the run queue. Simulating a RBT.
+ */
+ if (p->p_priority <= 10) {
+ TAILQ_INSERT_HEAD(&spc->td_runq, p, p_runq);
+ }
+ else {
+ TAILQ_INSERT_TAIL(&spc->td_runq, p, p_runq);
+ }
+#else
+ int queue = p->p_priority >> 2;
TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
spc->spc_whichqs |= (1 << queue);
+#endif
cpuset_add(&sched_queued_cpus, p->p_cpu);
if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
@@ -265,18 +368,25 @@ void
remrunqueue(struct proc *p)
{
struct schedstate_percpu *spc;
- int queue = p->p_priority >> 2;
SCHED_ASSERT_LOCKED();
spc = &p->p_cpu->ci_schedstate;
spc->spc_nrun--;
+#ifdef TSTSCHED
+ TAILQ_REMOVE(&spc->td_runq, p, p_runq);
+ if (TAILQ_EMPTY(&spc->td_runq)) {
+ cpuset_del(&sched_queued_cpus, p->p_cpu);
+ }
+#else
+ int queue = p->p_priority >> 2;
TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
spc->spc_whichqs &= ~(1 << queue);
if (spc->spc_whichqs == 0)
cpuset_del(&sched_queued_cpus, p->p_cpu);
}
+#endif
}
struct proc *
@@ -284,12 +394,40 @@ sched_chooseproc(void)
{
struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
struct proc *p;
+#ifndef TSTSCHED
int queue;
+#endif
SCHED_ASSERT_LOCKED();
#ifdef MULTIPROCESSOR
if (spc->spc_schedflags & SPCF_SHOULDHALT) {
+#ifdef TSTSCHED
+ while ((p = TAILQ_FIRST(&spc->td_runq))) {
+ remrunqueue(p);
+ /* reboot/shutdown: move all remaining proc's to
+ * primary CPU so we can halt all the secondary
+ * CPUs. */
+ p->p_cpu = &cpu_info_primary;
+ setrunqueue(p);
+ /*
+ * XXX: This below code was added previously, but doesn't
+ * work as is below due to removing sched_steal_proc().
+ *
+ * cvs rev 1.43: Allow pegged process on secondary CPUs to
+ * continue to be scheduled when halting a CPU. Necessary for
+ * intr_barrier(9) to work when interrupts are
+ * targeted at secondary CPUs.
+
+ p->p_cpu = sched_choosecpu(p);
+ setrunqueue(p);
+ if (p->p_cpu == curcpu()) {
+ KASSERT(p->p_flag & P_CPUPEG);
+ goto again;
+ }
+ */
+ }
+#else
if (spc->spc_whichqs) {
for (queue = 0; queue < SCHED_NQS; queue++) {
while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
@@ -303,6 +441,7 @@ sched_chooseproc(void)
}
}
}
+#endif
p = spc->spc_idleproc;
KASSERT(p);
KASSERT(p->p_wchan == NULL);
@@ -312,6 +451,32 @@ sched_chooseproc(void)
#endif
again:
+#ifdef TSTSCHED
+ if (!TAILQ_EMPTY(&spc->td_runq)) {
+ p = TAILQ_FIRST(&spc->td_runq);
+ remrunqueue(p);
+ sched_noidle++;
+ KASSERT(p->p_stat == SRUN);
+ /* Idle this cpu quickly, don't steal any procs here.
+ * we trust that next call to sched_choosecpu() will select
+ * this now idle cpu for scheduling.
+ *
+ * We try to idle and put this CPU into idle
+ * pool, this will help us in ARM big-little, and
+ * also in saving laptop battery.
+ *
+ * This causes a lot of shifting of proc's between
+ * different cores, ideally if a proc is on a core,
+ * try to keep it there, we have already chosen a CPU before
+ * entering this function, now don't go on trying to trash
+ * other CPUs cache. We trust that the other core will finish
+ * its assigned proc so it is removed from that cpu's run queue.
+ *
+ * Backout if concern raised in cvs rev
+ * 1.43 is not able to be fixed.
+ */
+ } else {
+#else
if (spc->spc_whichqs) {
queue = ffs(spc->spc_whichqs) - 1;
p = TAILQ_FIRST(&spc->spc_qs[queue]);
@@ -319,6 +484,7 @@ again:
sched_noidle++;
KASSERT(p->p_stat == SRUN);
} else if ((p = sched_steal_proc(curcpu())) == NULL) {
+#endif
p = spc->spc_idleproc;
if (p == NULL) {
int s;
@@ -337,12 +503,13 @@ again:
}
KASSERT(p);
p->p_stat = SRUN;
- }
+ }
KASSERT(p->p_wchan == NULL);
- return (p);
+ return (p);
}
+#ifndef TSTSCHED
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
@@ -398,6 +565,7 @@ sched_choosecpu_fork(struct proc *parent, int flags)
return (curcpu());
#endif
}
+#endif
struct cpu_info *
sched_choosecpu(struct proc *p)
@@ -416,6 +584,31 @@ sched_choosecpu(struct proc *p)
sched_choose++;
+#ifdef TSTSCHED
+ /* during boot, the disk callbacks are not completed,
+ * current code relies on sched_steal_proc(), but in
+ * an effort to avoid CPU ping-pong, we have
+ * removed sched_steal_proc(). So instead,
+ * we lock all initial processing to boot CPU, and
+ * later after kernel has fully inited we release all
+ * CPUs loose.
+ */
+
+ if (sched_start_dquantum == 0) {
+ return &cpu_info_primary;
+ }
+
+ /* No other proc on curproc's td_runq, set soft affinity
+ * and return quick, we potentially save the CPU cache.
+ * Kind of like a soft-cpupeg.
+ */
+
+ if (p->p_cpu->ci_schedstate.spc_nrun == 0) {
+ sched_nomigrations++;
+ return (p->p_cpu);
+ }
+#endif
+
/*
* Look at all cpus that are currently idle and have nothing queued.
* If there are none, pick the cheapest of those.
@@ -463,6 +656,7 @@ sched_choosecpu(struct proc *p)
#endif
}
+#ifndef TSTSCHED
/*
* Attempt to steal a proc from some cpu.
*/
@@ -517,6 +711,7 @@ sched_steal_proc(struct cpu_info *self)
#endif
return (best);
}
+#endif /* TSTSCHED */
#ifdef MULTIPROCESSOR
/*
@@ -727,6 +922,9 @@ sched_barrier(struct cpu_info *ci)
/*
* Functions to manipulate cpu sets.
+ *
+ * XXX: These below will work when ncpus > MAXCPU?
+ * array index outside bounds?
*/
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;
@@ -853,6 +1051,28 @@ sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp,
size_t newlen)
struct cpu_info *ci;
int err, newsmt;
+/*
+ * didn't want to print debug info all the time, hijack this for now,
+ * so we can print when we want.
+ */
+#ifdef TSTSCHED
+ int cpu_index = 0;
+ CPU_INFO_FOREACH(cii, ci) {
+ struct schedstate_percpu *spc = &ci->ci_schedstate;
+ printf("CPU %d --> %d\t", cpu_index, spc->spc_nrun);
+ cpu_index++;
+ }
+ printf("\n\n\n\n\n");
+ printf("null_count ---> %d\n", curproc_null_count);
+ printf("hi_rrticks_ctr ---> %d\n", hi_rrticks_ctr);
+ printf("med_hi_rrticks_ctr ---> %d\n", med_hi_rrticks_ctr);
+ printf("med_rrticks_ctr ---> %d\n", med_rrticks_ctr);
+ printf("low_med_rrticks_ctr ---> %d\n", low_med_rrticks_ctr);
+ printf("low_rrticks_ctr ---> %d\n", low_rrticks_ctr);
+ printf("sys_rrticks_ctr ---> %d\n", sys_rrticks_ctr);
+ printf("affnty_cnt ---> %d\n", affnty_cnt);
+#endif
+
newsmt = sched_smt;
err = sysctl_int(oldp, oldlenp, newp, newlen, &newsmt);
if (err)
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 2ec52e662a5..1e99f16811c 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -443,7 +443,7 @@ wakeup_n(const volatile void *ident, int n)
* and sleep_finish().
*/
if (p == curproc) {
- KASSERT(p->p_stat == SONPROC);
+ KASSERT(p->p_stat == SONPROC || p->p_stat == SRUN);
continue;
}
if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
index 00a08861b59..3c6ba977d36 100644
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -55,7 +55,6 @@
int lbolt; /* once a second sleep address */
-int rrticks_init; /* # of hardclock ticks per roundrobin() */
#ifdef MULTIPROCESSOR
struct __mp_lock sched_lock;
@@ -64,6 +63,22 @@ struct __mp_lock sched_lock;
void schedcpu(void *);
void updatepri(struct proc *);
+#ifdef TSTSCHED
+int curproc_null_count;
+
+int hi_rrticks;
+int med_hi_rrticks;
+int med_rrticks;
+int low_med_rrticks;
+int low_rrticks;
+int sys_rrticks;
+
+int affnty_cnt;
+#else
+int rrticks_init; /* # of hardclock ticks per roundrobin() */
+#endif
+
+
void
scheduler_start(void)
{
@@ -77,12 +92,127 @@ scheduler_start(void)
*/
timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);
+#ifdef TSTSCHED
+ hi_rrticks = (int) hz/24; /* 40 ms rr intvl */
+ med_hi_rrticks = (int) hz/16; /* 60 ms rr intvl */
+ med_rrticks = (int) hz/12; /* 80 ms rr intvl */
+ low_med_rrticks = (int) hz/10; /* 100 ms rr intvl, orig BSD
schdlr */
+ low_rrticks = (int) hz/8; /* 120 ms rr intvl */
+ sys_rrticks = (int) hz/5; /* 200 ms rr intvl */
+
+ curproc_null_count = 0;
+ affnty_cnt = 0;
+
+printf("hi_rrticks ---> %d\n", hi_rrticks);
+printf("med_hi_rrticks ---> %d\n", med_hi_rrticks);
+printf("med_rrticks ---> %d\n", med_rrticks);
+printf("low_med_rrticks ---> %d\n", low_med_rrticks);
+printf("low_rrticks ---> %d\n", low_rrticks);
+printf("sys_rrticks ---> %d\n", sys_rrticks);
+#else
rrticks_init = hz / 10;
+#endif
+
schedcpu(&schedcpu_to);
}
+#ifdef TSTSCHED
+/*
+ * Is the nice calculation for sndiod busted?
+ * the p_priority stays around 24 for sndiod???
+ *
+ */
+int get_rr_dquantum(struct proc *p) {
+ if (p == NULL || sched_start_dquantum == 0)
+ return low_med_rrticks;
+
+ /* p guaranteed to be non-null now, so assert right away */
+ KASSERT(p != NULL);
+
+ /* softclock, i915/DRM can be finished quite fast.*/
+ if (p->p_priority == 0) {
+ return hi_rrticks;
+ }
+ /* If a P_SYSTEM proc comes, it might need more time */
+ else if (p->p_priority > 0 && p->p_priority < 10) {
+ return sys_rrticks;
+ }
+ else if (p->p_priority >= 10 && p->p_priority < 20) {
+ return med_hi_rrticks;
+ }
+ else if (p->p_priority >= 20 && p->p_priority < 30) {
+ return med_rrticks;
+ }
+ else if (p->p_priority >= 30 && p->p_priority < 50) {
+ return low_med_rrticks;
+ }
+ /* browsers, libreoffice, compiler need more time */
+ else if (p->p_priority >= 50) {
+ return low_rrticks;
+ }
+
+ return low_med_rrticks;
+}
+
+/*
+ * Force switch among threads every assigned time quantum.
+ *
+ * The reason for changing this functionality is that, regardless
+ * of the time the curproc started, it is kicked off CPU every 100 ms.
+ * curproc might have just started 10 ms ago, and it is kicked off.
+ * This is inaccurate, if curproc can run, it should run for its
+ * full assigned quantum. Just experimenting with different time slices
+ * here, and to see if we can assign different time slices to different
+ * p_priority proc's.
+ */
+void check_quantum_expiration() {
+ int s;
+ struct proc* p = curproc;
+ if ((p = curproc) == NULL) {
+ /* Just used during initial development to understand how
+ * many times, we are getting nulls in curproc. Only once
+ * during boot on 1 machine, on two others 2-3 times only.
+ *
+ * We need to verify that we are not getting a lot of nulls
+ * in this func, otherwise, our scheduling will not
+ * be precise. We may need to move away and use something
+ * better.
+ * */
+ curproc_null_count++;
+ return;
+ }
+
+ if (p->p_cnsd_slice >= p->p_dquantum) {
+ struct schedstate_percpu *spc = &p->p_cpu->ci_schedstate;
+ SCHED_LOCK(s);
+ /* reset p_cnsd_slice immediately */
+ p->p_cnsd_slice = 0;
+
+ /*
+ * The process has already been through a roundrobin
+ * without switching, time for it to stop using the CPU.
+ */
+ atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
+ SCHED_UNLOCK(s);
+
+ /* need_resched() does actual yield() as it kicks off a bunch
+ * of IPIs in machdep.c in cpu_kick().
+ *
+ * Moved inside here so it fires only when we want to
+ * switch proc's, and does it unconditionally.
+ *
+ * Does it make sense to fill p_cnsd_slice, and
+ * allow current proc to continue, if no queued proc on
+ * its per CPU queue?
+ */
+
+ /* if (spc->spc_nrun) */
+ need_resched(p->p_cpu);
+ }
+}
+#else
/*
- * Force switch among equal priority processes every 100ms.
+ * Force switch among threads every 100ms.
*/
void
roundrobin(struct cpu_info *ci)
@@ -109,6 +239,7 @@ roundrobin(struct cpu_info *ci)
if (spc->spc_nrun)
need_resched(ci);
}
+#endif
/*
* Constants for digital decay and forget:
@@ -117,7 +248,7 @@ roundrobin(struct cpu_info *ci)
* Note that, as ps(1) mentions, this can let percentages
* total over 100% (I've seen 137.9% for 3 processes).
*
- * Note that hardclock updates p_estcpu and p_cpticks independently.
+ * Note that hardclock updates p_cpticks independently.
*
* We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
* That is, the system wants to compute a value of decay such
@@ -196,7 +327,7 @@ fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
/* exp(-1/20) */
#define CCPU_SHIFT 11
/*
- * Recompute process priorities, every second.
+ * Recompute thread priorities, every second.
*/
void
schedcpu(void *arg)
@@ -218,6 +349,8 @@ schedcpu(void *arg)
KASSERT(phz);
LIST_FOREACH(p, &allproc, p_list) {
+ /* KASSERT(p->p_cpu != NULL); ??? */
+
/*
* Idle threads are never placed on the runqueue,
* therefore computing their priority is pointless.
@@ -228,8 +361,16 @@ schedcpu(void *arg)
/*
* Increment sleep time (if sleeping). We ignore overflow.
*/
+#ifdef TSTSCHED
+ /* Increment only when thread slept or was stopped for the
+ * entire second!! buglet? */
+ if ((p->p_stat == SSLEEP || p->p_stat == SSTOP)
+ && p->p_cpticks == 0)
+ p->p_slptime++;
+#else
if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
p->p_slptime++;
+#endif
p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
/*
* If the process has slept the entire second,
@@ -254,6 +395,27 @@ schedcpu(void *arg)
newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
p->p_estcpu = newcpu;
resetpriority(p);
+#ifdef TSTSCHED
+ /* if proc priority is too low (i.e. high p_priority),
+ * unconditionally change it here for all procs.
+ * that's what the current code does.
+ *
+ * thinking of removing the remrunq/setrunq dance
+ * below due to a TOCTOU race, if a proc is not in runq
+ * there is no effect, but then we again add to runq.
+ * Should we just change p_priority and then leave
+ * the process be till its next time in runq?
+ * this loop needs to be short.
+ */
+ if (p->p_priority >= PUSER) {
+ p->p_priority = p->p_usrpri;
+ if (p->p_stat == SRUN &&
+ (p->p_priority != p->p_usrpri)) {
+ remrunqueue(p);
+ setrunqueue(p);
+ }
+ }
+#else
if (p->p_priority >= PUSER) {
if (p->p_stat == SRUN &&
(p->p_priority / SCHED_PPQ) !=
@@ -264,6 +426,7 @@ schedcpu(void *arg)
} else
p->p_priority = p->p_usrpri;
}
+#endif
SCHED_UNLOCK(s);
}
uvm_meter();
@@ -318,9 +481,7 @@ yield(void)
/*
* General preemption call. Puts the current process back on its run queue
- * and performs an involuntary context switch. If a process is supplied,
- * we switch to that process. Otherwise, we use the normal process selection
- * criteria.
+ * and performs an involuntary context switch.
*/
void
preempt(void)
@@ -370,7 +531,11 @@ mi_switch(void)
* Compute the amount of time during which the current
* process was running, and add that to its total so far.
*/
+#ifdef TSTSCHED
+ getnanouptime(&ts);
+#else
nanouptime(&ts);
+#endif
if (timespeccmp(&ts, &spc->spc_runtime, <)) {
#if 0
printf("uptime is not monotonic! "
@@ -391,17 +556,65 @@ mi_switch(void)
* Process is about to yield the CPU; clear the appropriate
* scheduling flags.
*/
+#ifdef TSTSCHED
+ atomic_clearbits_int(&spc->spc_schedflags, SPCF_SHOULDYIELD);
+#else
atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
+#endif
nextproc = sched_chooseproc();
+#ifdef TSTSCHED
+ if (p != nextproc)
+ uvmexp.swtch++;
+ else
+ affnty_cnt++;
+
+ /* printf("sched_bsd: cpu_switchto() from %s to %s \n",
+ p->p_p->ps_comm, nextproc->p_p->ps_comm); */
+
+/* Bug: We are comparing spc_curpriority to decide in resched_proc()
+ * if we need_resched(), but spc_curpriority is not being set anywhere
+ * except in kern_sig while delivering signals and in kern_synch
+ * after sleeping. The per cpu sched state should have the most
+ * updated p_usrpri so that we can make correct decision whether to kick
+ * the curproc out of spc.
+ *
+ * We do that here, just before cpu_switchto(), as if cpu_switchto()
+ * fails, we are in deep trouble anyways.
+ */
+ spc->spc_curpriority = nextproc->p_usrpri;
+
+/*
+ * Enhancement: It looks to be ok to remain the same proc on the same CPU, if
+ * there is no other proc queued, we have affinity on light load.
+ * Ignore high load for now, it is complex anyway, looking at FreeBSD and
+ * Dfly, it is tough to get it right under load, opt for simple things first.
+ *
+ * we increment affnty_cnt, if old and new proc are the same, this will
+ * frequently happen under light load. If we design our load balancing
+ * algorithm correctly, cpu affinity should also happen under heavy load.
+ *
+ * An idea: the actual number of proc in SRUN or SONPROC are quite small,
+ * globally try to divide the total proc's in runnable state by number of
+ * desired CPUs to run on, it doesn't matter which CPU a proc runs
+ * initially, during heavy load we should soft-lock it. How does networking
+ * and disk subsystem handle this affinity problem?
+ *
+ * Let's try to fix sched_chooseproc() and sched_choosecpu(), so we
+ * pair proc's to last used CPU.
+ *
+ * In any case, this below is a free boost, if reviewed to be correct.
+ */
+ cpu_switchto(p, nextproc);
+#else
if (p != nextproc) {
uvmexp.swtch++;
cpu_switchto(p, nextproc);
} else {
p->p_stat = SONPROC;
}
-
+#endif
clear_resched(curcpu());
SCHED_ASSERT_LOCKED();
@@ -427,7 +640,11 @@ mi_switch(void)
*/
KASSERT(p->p_cpu == curcpu());
+#ifdef TSTSCHED
+ getnanouptime(&p->p_cpu->ci_schedstate.spc_runtime);
+#else
nanouptime(&p->p_cpu->ci_schedstate.spc_runtime);
+#endif
#ifdef MULTIPROCESSOR
/*
@@ -441,6 +658,8 @@ mi_switch(void)
#endif
}
+/* this tells curproc on curcpu, get the hell out,
+ * a higher priority thread has landed in the run queue! */
static __inline void
resched_proc(struct proc *p, u_char pri)
{
@@ -514,9 +733,31 @@ resetpriority(struct proc *p)
SCHED_ASSERT_LOCKED();
+#ifdef TSTSCHED
+ /* sndiod is penalized when it asks for nice of -20.
+ * Really treat kernel and userland threads differently!
+ * Start with PZERO for kernel threads.
+ *
+ * Calculations need to be reviewed for correctness.
+ */
+ if (p->p_flag & P_SYSTEM) {
+ newpriority = PZERO + p->p_estcpu +
+ NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
+ if (newpriority > MAXPRI)
+ newpriority = MAXPRI;
+ if (newpriority < MINPRI)
+ newpriority = MINPRI;
+ }
+ else {
+ newpriority = PUSER + p->p_estcpu +
+ NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
+ newpriority = min(newpriority, MAXPRI);
+ }
+#else
newpriority = PUSER + p->p_estcpu +
NICE_WEIGHT * (p->p_p->ps_nice - NZERO);
newpriority = min(newpriority, MAXPRI);
+#endif
p->p_usrpri = newpriority;
resched_proc(p, p->p_usrpri);
}
@@ -542,7 +783,8 @@ schedclock(struct proc *p)
struct schedstate_percpu *spc = &ci->ci_schedstate;
int s;
- if (p == spc->spc_idleproc)
+ /* @mpi */
+ if (p == spc->spc_idleproc || spc->spc_spinning)
return;
SCHED_LOCK(s);
diff --git a/sys/sys/param.h b/sys/sys/param.h
index 136a4e48575..8db8dbac12f 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -107,6 +107,10 @@
#define PUSER 50
#define MAXPRI 127 /* Priorities range from 0 through
MAXPRI. */
+#ifdef TSTSCHED
+#define MINPRI 0
+#endif
+
#define PRIMASK 0x0ff
#define PCATCH 0x100 /* OR'd with pri for tsleep to check
signals */
#define PNORELOCK 0x200 /* OR'd with pri for msleep to not reaquire
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 1886a7d3826..3a236668355 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -351,6 +351,10 @@ struct proc {
int p_siglist; /* Signals arrived but not delivered. */
+#ifdef TSTSCHED
+ u_int p_dquantum; /* dynamic quantum allocated, N * 10 ms
*/
+ u_int p_cnsd_slice; /* approx time using a CPU, N * 10 ms.
*/
+#endif
/* End area that is zeroed on creation. */
#define p_endzero p_startcopy
@@ -391,6 +395,7 @@ struct proc {
};
/* Status values. */
+/* An idle thread is not flagged here, why not? */
#define SIDL 1 /* Thread being created by fork. */
#define SRUN 2 /* Currently runnable. */
#define SSLEEP 3 /* Sleeping on an address. */
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index 30b93b4bd3c..ab3de90b56c 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -88,7 +88,9 @@
#define CP_IDLE 5
#define CPUSTATES 6
+#ifndef TSTSCHED
#define SCHED_NQS 32 /* 32 run queues. */
+#endif
struct smr_entry;
@@ -97,21 +99,29 @@ struct smr_entry;
*/
struct schedstate_percpu {
struct proc *spc_idleproc; /* idle proc for this cpu */
+#ifdef TSTSCHED
+ TAILQ_HEAD(prochead, proc) td_runq;
+#else
TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
+#endif
LIST_HEAD(,proc) spc_deadproc;
struct timespec spc_runtime; /* time curproc started running */
volatile int spc_schedflags; /* flags; see below */
u_int spc_schedticks; /* ticks for schedclock() */
u_int64_t spc_cp_time[CPUSTATES]; /* CPU state statistics */
u_char spc_curpriority; /* usrpri of curproc */
+#ifndef TSTSCHED
int spc_rrticks; /* ticks until roundrobin() */
+#endif
int spc_pscnt; /* prof/stat counter */
int spc_psdiv; /* prof/stat divisor */
u_int spc_nrun; /* procs on the run queues */
fixpt_t spc_ldavg; /* shortest load avg. for this cpu */
+#ifndef TSTSCHED
volatile uint32_t spc_whichqs;
+#endif
volatile u_int spc_spinning; /* this cpu is currently spinning */
SIMPLEQ_HEAD(, smr_entry) spc_deferred; /* deferred smr calls */
@@ -134,23 +144,64 @@ struct cpustats {
#ifdef _KERNEL
/* spc_flags */
+#ifdef TSTSCHED
+#define SPCF_SHOULDYIELD 0x0002 /* process should yield the CPU */
+#else
#define SPCF_SEENRR 0x0001 /* process has seen roundrobin() */
#define SPCF_SHOULDYIELD 0x0002 /* process should yield the CPU */
#define SPCF_SWITCHCLEAR (SPCF_SEENRR|SPCF_SHOULDYIELD)
+#endif /* TSTSCHED */
#define SPCF_SHOULDHALT 0x0004 /* CPU should be vacated */
#define SPCF_HALTED 0x0008 /* CPU has been halted */
-#define SCHED_PPQ	(128 / SCHED_NQS)	/* priorities per queue */
+#ifdef TSTSCHED
+#define NICE_WEIGHT	3	/* p_priority increments per nice level */
+#define ESTCPULIM(e)	min((e), NICE_WEIGHT * PRIO_MAX)	/* XXX: revisit this */
+#else
#define NICE_WEIGHT 2 /* priorities per nice level */
+#define SCHED_PPQ	(128 / SCHED_NQS)	/* priorities per queue */
#define ESTCPULIM(e) min((e), NICE_WEIGHT * PRIO_MAX - SCHED_PPQ)
+#endif
extern int schedhz; /* ideally: 16 */
+
+#ifdef TSTSCHED
+extern int swap_initial_wakeup_cnt; /* 10 sec swapper dly cnt */
+extern int sched_start_dquantum; /* start different rr intervals
+ after kernel has inited fully */
+
+extern int curproc_null_count; /* count null p=curproc */
+
+extern int hi_rrticks; /* 40 ms rr intvl */
+extern int med_hi_rrticks; /* 60 ms rr intvl */
+extern int med_rrticks; /* 80 ms rr intvl */
+extern int low_med_rrticks; /* 100 ms rr intvl, orig BSD schdlr */
+extern int low_rrticks; /* 120 ms rr intvl */
+extern int sys_rrticks; /* 200 ms rr intvl */
+
+
+extern int hi_rrticks_ctr; /* 40 ms rr intvl */
+extern int med_hi_rrticks_ctr; /* 60 ms rr intvl */
+extern int med_rrticks_ctr; /* 80 ms rr intvl */
+extern int low_med_rrticks_ctr; /* 100 ms rr intvl, orig BSD schdlr */
+extern int low_rrticks_ctr; /* 120 ms rr intvl */
+extern int sys_rrticks_ctr; /* 200 ms rr intvl */
+
+extern int affnty_cnt;
+
+#else
extern int rrticks_init; /* ticks per roundrobin() */
+#endif
struct proc;
void schedclock(struct proc *);
struct cpu_info;
+#ifdef TSTSCHED
+void check_quantum_expiration(void);
+int get_rr_dquantum(struct proc *);
+#else
void roundrobin(struct cpu_info *);
+#endif
void scheduler_start(void);
void userret(struct proc *p);
@@ -178,7 +229,11 @@ void sched_start_secondary_cpus(void);
void sched_stop_secondary_cpus(void);
#endif
+#ifdef TSTSCHED
+#define cpu_is_idle(ci)	(TAILQ_EMPTY(&(ci)->ci_schedstate.td_runq))
+#else
#define cpu_is_idle(ci) ((ci)->ci_schedstate.spc_whichqs == 0)
+#endif
int cpu_is_online(struct cpu_info *);
void sched_init_runqueues(void);
diff --git a/sys/uvm/uvm_glue.c b/sys/uvm/uvm_glue.c
index e17500c47d1..7a3a9c38f8b 100644
--- a/sys/uvm/uvm_glue.c
+++ b/sys/uvm/uvm_glue.c
@@ -329,7 +329,10 @@ int swapdebug = 0;
/*
- * swapout_threads: find threads that can be swapped
+ * swapout_threads: find userland threads that can be swapped
+ *
+ * build a list of the longest sleepers and the largest memory hogs;
+ * the pass should be efficient at evicting those.
*
* - called by the pagedaemon
* - try and swap at least one processs
@@ -372,6 +375,7 @@ uvm_swapout_threads(void)
switch (p->p_stat) {
case SRUN:
case SONPROC:
+ case SIDL:
goto next_process;
case SSLEEP:
diff --git a/sys/uvm/uvm_meter.c b/sys/uvm/uvm_meter.c
index 794811366ba..5d6b5b8190b 100644
--- a/sys/uvm/uvm_meter.c
+++ b/sys/uvm/uvm_meter.c
@@ -63,6 +63,16 @@
int maxslp = MAXSLP; /* patchable ... */
struct loadavg averunnable;
+#ifdef TSTSCHED
+/* XXX: Can we reuse sched_start_dquantum here?
+ * The objective is to start the swapper only after the
+ * kernel has fully initialized. Do we run into swap
+ * issues very early on low-memory systems, even before
+ * reaching the login screen?
+ */
+int swap_initial_wakeup_cnt = MAXSLP / 2; /* 10 seconds? */
+#endif
+
/*
* constants for averages over 1, 5, and 15 minutes when sampling at
* 5 second intervals.
@@ -86,8 +96,23 @@ uvm_meter(void)
{
if ((time_second % 5) == 0)
uvm_loadav(&averunnable);
+
+#ifdef TSTSCHED
+	/* XXX: don't depend on p_slptime; this field could go away if we
+	 * move to a different interface for sleeping processes.
+	 *
+	 * we define a constant such that the swapper is woken up every 1 sec
+	 * after an initial 10 secs of inactivity, as the previous author
+	 * intended.
+	 */
+
+ if (swap_initial_wakeup_cnt == 0)
+ wakeup(&proc0);
+ else
+ swap_initial_wakeup_cnt--;
+#else
if (proc0.p_slptime > (maxslp / 2))
wakeup(&proc0);
+#endif
}
/*
diff --git a/usr.bin/top/machine.c b/usr.bin/top/machine.c
index 80b10e64b1d..b6e90f25132 100644
--- a/usr.bin/top/machine.c
+++ b/usr.bin/top/machine.c
@@ -283,7 +283,7 @@ get_system_info(struct system_info *si)
size = sizeof(sysload);
if (sysctl(sysload_mib, 2, &sysload, &size, NULL, 0) < 0)
- warn("sysctl failed");
+ warn("sysctl sysload_mib failed");
infoloadp = si->load_avg;
for (i = 0; i < 3; i++)
*infoloadp++ = ((double) sysload.ldavg[i]) / sysload.fscale;
@@ -292,12 +292,12 @@ get_system_info(struct system_info *si)
/* get total -- systemwide main memory usage structure */
size = sizeof(uvmexp);
if (sysctl(uvmexp_mib, 2, &uvmexp, &size, NULL, 0) < 0) {
- warn("sysctl failed");
+ warn("sysctl uvmexp_mib failed");
bzero(&uvmexp, sizeof(uvmexp));
}
size = sizeof(bcstats);
if (sysctl(bcstats_mib, 3, &bcstats, &size, NULL, 0) < 0) {
- warn("sysctl failed");
+ warn("sysctl bcstats_mib failed");
bzero(&bcstats, sizeof(bcstats));
}
/* convert memory stats to Kbytes */
@@ -578,7 +578,7 @@ format_next_process(caddr_t hndl, const char
*(*get_userid)(uid_t, int),
/* format this entry */
snprintf(fmt, sizeof(fmt), Proc_format, pp->p_pid, buf,
- pp->p_priority - PZERO, pp->p_nice - NZERO,
+ pp->p_priority, pp->p_nice - NZERO,
format_k(pagetok(PROCSIZE(pp))),
format_k(pagetok(pp->p_vm_rssize)),
(pp->p_stat == SSLEEP && pp->p_slptime > maxslp) ?