[Xen-devel] [PATCH v2 2/2] xen: sched: rtds: use non-atomic bit-ops
Vcpu flags are checked and cleared atomically. Performance can be improved with corresponding non-atomic versions since schedule.c already has spin_locks in place. Signed-off-by: Tianyang Chen <ti...@cis.upenn.edu> --- Changes since v1: -none --- xen/common/sched_rt.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 255d1f6..8ecc908 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -936,7 +936,7 @@ burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) if ( svc->cur_budget <= 0 ) { svc->cur_budget = 0; -set_bit(__RTDS_depleted, >flags); +__set_bit(__RTDS_depleted, >flags); } /* TRACE */ @@ -1050,7 +1050,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched if ( snext != scurr && !is_idle_vcpu(current) && vcpu_runnable(current) ) -set_bit(__RTDS_delayed_runq_add, >flags); +__set_bit(__RTDS_delayed_runq_add, >flags); snext->last_start = now; ret.time = -1; /* if an idle vcpu is picked */ @@ -1059,7 +1059,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched if ( snext != scurr ) { q_remove(snext); -set_bit(__RTDS_scheduled, >flags); +__set_bit(__RTDS_scheduled, >flags); } if ( snext->vcpu->processor != cpu ) { @@ -1093,7 +1093,7 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) replq_remove(ops, svc); } else if ( svc->flags & RTDS_delayed_runq_add ) -clear_bit(__RTDS_delayed_runq_add, >flags); +__clear_bit(__RTDS_delayed_runq_add, >flags); } /* @@ -1235,7 +1235,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) */ if ( unlikely(svc->flags & RTDS_scheduled) ) { -set_bit(__RTDS_delayed_runq_add, >flags); +__set_bit(__RTDS_delayed_runq_add, >flags); /* * The vcpu is waking up already, and we didn't even had the time to * remove its next replenishment event from the replenishment queue @@ -1266,12 +1266,12 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) struct 
rt_vcpu *svc = rt_vcpu(vc); spinlock_t *lock = vcpu_schedule_lock_irq(vc); -clear_bit(__RTDS_scheduled, >flags); +__clear_bit(__RTDS_scheduled, >flags); /* not insert idle vcpu to runq */ if ( is_idle_vcpu(vc) ) goto out; -if ( test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && +if ( __test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && likely(vcpu_runnable(vc)) ) { runq_insert(ops, svc); @@ -1447,7 +1447,7 @@ static void repl_timer_handler(void *data){ runq_tickle(ops, next_on_runq); } else if ( vcpu_on_q(svc) && - test_and_clear_bit(__RTDS_depleted, >flags) ) + __test_and_clear_bit(__RTDS_depleted, >flags) ) runq_tickle(ops, svc); list_del(>replq_elem); -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH v2 0/2] xen: sched: rtds clean-up
The first part of this patch series aims at fixing coding style issues for control structures, as well as aligning comments in the structs. Related discussions: http://lists.xenproject.org/archives/html/xen-devel/2016-03/msg01528.html Because locks are grabbed in schedule.c before hooks are called, underscores in front of function names are removed. The second part replaces atomic bit-ops with non-atomic ones since locks are grabbed in schedule.c. Discussions: http://lists.xenproject.org/archives/html/xen-devel/2016-03/msg01528.html http://www.gossamer-threads.com/lists/xen/devel/431251?do=post_view_threaded#431251 Tianyang Chen (2): xen: sched: rtds code clean-up xen: sched: rtds: use non-atomic bit-ops xen/common/sched_rt.c | 106 ++--- 1 file changed, 56 insertions(+), 50 deletions(-) -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH v2 1/2] xen: sched: rtds code clean-up
No functional change: -aligned comments in rt_vcpu struct -removed double underscores from the names of some functions -fixed coding sytle for control structures involving lists -fixed typos in the comments -added comments for UPDATE_LIMIT_SHIFT Signed-off-by: Tianyang Chen <ti...@cis.upenn.edu> --- Changes since v1: -added a detailed list of changes --- xen/common/sched_rt.c | 90 ++--- 1 file changed, 48 insertions(+), 42 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 7f8f411..255d1f6 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -80,7 +80,7 @@ * in schedule.c * * The functions involes RunQ and needs to grab locks are: - *vcpu_insert, vcpu_remove, context_saved, __runq_insert + *vcpu_insert, vcpu_remove, context_saved, runq_insert */ @@ -107,6 +107,12 @@ */ #define RTDS_MIN_BUDGET (MICROSECS(10)) +/* + * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding + * the next deadline, performing addition could be faster if the difference + * between cur_deadline and now is small. If the difference is bigger than + * 1024 * period, use multiplication. + */ #define UPDATE_LIMIT_SHIFT 10 /* @@ -158,12 +164,12 @@ static void repl_timer_handler(void *data); /* - * Systme-wide private data, include global RunQueue/DepletedQ + * System-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. 
It can be grabbed via vcpu_schedule_lock_irq() */ struct rt_private { -spinlock_t lock;/* the global coarse grand lock */ +spinlock_t lock;/* the global coarse-grained lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ @@ -176,7 +182,7 @@ struct rt_private { * Virtual CPU */ struct rt_vcpu { -struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head q_elem; /* on the runq/depletedq list */ struct list_head replq_elem; /* on the replenishment events list */ /* Up-pointers */ @@ -188,11 +194,11 @@ struct rt_vcpu { s_time_t budget; /* VCPU current infomation in nanosecond */ -s_time_t cur_budget;/* current budget */ -s_time_t last_start;/* last start time */ -s_time_t cur_deadline; /* current deadline for EDF */ +s_time_t cur_budget; /* current budget */ +s_time_t last_start; /* last start time */ +s_time_t cur_deadline; /* current deadline for EDF */ -unsigned flags; /* mark __RTDS_scheduled, etc.. */ +unsigned flags; /* mark __RTDS_scheduled, etc.. */ }; /* @@ -241,13 +247,13 @@ static inline struct list_head *rt_replq(const struct scheduler *ops) * and the replenishment events queue. 
*/ static int -__vcpu_on_q(const struct rt_vcpu *svc) +vcpu_on_q(const struct rt_vcpu *svc) { return !list_empty(>q_elem); } static struct rt_vcpu * -__q_elem(struct list_head *elem) +q_elem(struct list_head *elem) { return list_entry(elem, struct rt_vcpu, q_elem); } @@ -303,7 +309,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) svc->cur_budget, svc->cur_deadline, svc->last_start, -__vcpu_on_q(svc), +vcpu_on_q(svc), vcpu_runnable(svc->vcpu), svc->flags, keyhandler_scratch); @@ -339,28 +345,28 @@ rt_dump(const struct scheduler *ops) replq = rt_replq(ops); printk("Global RunQueue info:\n"); -list_for_each( iter, runq ) +list_for_each ( iter, runq ) { -svc = __q_elem(iter); +svc = q_elem(iter); rt_dump_vcpu(ops, svc); } printk("Global DepletedQueue info:\n"); -list_for_each( iter, depletedq ) +list_for_each ( iter, depletedq ) { -svc = __q_elem(iter); +svc = q_elem(iter); rt_dump_vcpu(ops, svc); } printk("Global Replenishment Events info:\n"); -list_for_each( iter, replq ) +list_for_each ( iter, replq ) { svc = replq_elem(iter); rt_dump_vcpu(ops, svc); } printk("Domain info:\n"); -list_for_each( iter, >sdom ) +list_for_each ( iter, >sdom ) { struct vcpu *v; @@ -380,7 +386,7 @@ rt_dump(const struct scheduler *ops) /* * update deadline and budget when now >= cur_deadline - * it need to be updated to the deadline of the current period + * it needs to be updated to the deadline of the current period */ static void rt_update_deadline(s_time_t now, struct rt_vcpu *svc) @@ -463,14 +469,14 @@ deadline_queue_insert(stru
[Xen-devel] [PATCH 2/2] xen: sched: rtds: use non-atomic bit-ops
Vcpu flags are checked and cleared atomically. Performance can be improved with corresponding non-atomic versions since schedule.c already has spin_locks in place. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> --- xen/common/sched_rt.c | 16 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 1584d53..1a18f6d 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -936,7 +936,7 @@ burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now) if ( svc->cur_budget <= 0 ) { svc->cur_budget = 0; -set_bit(__RTDS_depleted, >flags); +__set_bit(__RTDS_depleted, >flags); } /* TRACE */ @@ -1050,7 +1050,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched if ( snext != scurr && !is_idle_vcpu(current) && vcpu_runnable(current) ) -set_bit(__RTDS_delayed_runq_add, >flags); +__set_bit(__RTDS_delayed_runq_add, >flags); snext->last_start = now; ret.time = -1; /* if an idle vcpu is picked */ @@ -1059,7 +1059,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched if ( snext != scurr ) { q_remove(snext); -set_bit(__RTDS_scheduled, >flags); +__set_bit(__RTDS_scheduled, >flags); } if ( snext->vcpu->processor != cpu ) { @@ -1093,7 +1093,7 @@ rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) replq_remove(ops, svc); } else if ( svc->flags & RTDS_delayed_runq_add ) -clear_bit(__RTDS_delayed_runq_add, >flags); +__clear_bit(__RTDS_delayed_runq_add, >flags); } /* @@ -1235,7 +1235,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) */ if ( unlikely(svc->flags & RTDS_scheduled) ) { -set_bit(__RTDS_delayed_runq_add, >flags); +__set_bit(__RTDS_delayed_runq_add, >flags); /* * The vcpu is waking up already, and we didn't even had the time to * remove its next replenishment event from the replenishment queue @@ -1266,12 +1266,12 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) struct rt_vcpu *svc = rt_vcpu(vc); 
spinlock_t *lock = vcpu_schedule_lock_irq(vc); -clear_bit(__RTDS_scheduled, >flags); +__clear_bit(__RTDS_scheduled, >flags); /* not insert idle vcpu to runq */ if ( is_idle_vcpu(vc) ) goto out; -if ( test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && +if ( __test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && likely(vcpu_runnable(vc)) ) { runq_insert(ops, svc); @@ -1447,7 +1447,7 @@ static void repl_timer_handler(void *data){ runq_tickle(ops, next_on_runq); } else if ( vcpu_on_q(svc) && - test_and_clear_bit(__RTDS_depleted, >flags) ) + __test_and_clear_bit(__RTDS_depleted, >flags) ) runq_tickle(ops, svc); list_del(>replq_elem); -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH 1/2] xen: sched: rtds refactor code
No functional change: -Various coding style fix -Added comments for UPDATE_LIMIT_SHIFT. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> --- xen/common/sched_rt.c | 106 ++--- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 7f8f411..1584d53 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -80,7 +80,7 @@ * in schedule.c * * The functions involes RunQ and needs to grab locks are: - *vcpu_insert, vcpu_remove, context_saved, __runq_insert + *vcpu_insert, vcpu_remove, context_saved, runq_insert */ @@ -107,6 +107,12 @@ */ #define RTDS_MIN_BUDGET (MICROSECS(10)) +/* + * UPDATE_LIMIT_SHIT: a constant used in rt_update_deadline(). When finding + * the next deadline, performing addition could be faster if the difference + * between cur_deadline and now is small. If the difference is bigger than + * 1024 * period, use multiplication. + */ #define UPDATE_LIMIT_SHIFT 10 /* @@ -158,25 +164,25 @@ static void repl_timer_handler(void *data); /* - * Systme-wide private data, include global RunQueue/DepletedQ + * System-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. 
It can be grabbed via vcpu_schedule_lock_irq() */ struct rt_private { -spinlock_t lock;/* the global coarse grand lock */ -struct list_head sdom; /* list of availalbe domains, used for dump */ -struct list_head runq; /* ordered list of runnable vcpus */ -struct list_head depletedq; /* unordered list of depleted vcpus */ -struct list_head replq; /* ordered list of vcpus that need replenishment */ -cpumask_t tickled; /* cpus been tickled */ -struct timer *repl_timer; /* replenishment timer */ +spinlock_t lock; /* the global coarse grand lock */ +struct list_head sdom; /* list of availalbe domains, used for dump */ +struct list_head runq; /* ordered list of runnable vcpus */ +struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ +cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer;/* replenishment timer */ }; /* * Virtual CPU */ struct rt_vcpu { -struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head q_elem; /* on the runq/depletedq list */ struct list_head replq_elem; /* on the replenishment events list */ /* Up-pointers */ @@ -188,19 +194,19 @@ struct rt_vcpu { s_time_t budget; /* VCPU current infomation in nanosecond */ -s_time_t cur_budget;/* current budget */ -s_time_t last_start;/* last start time */ -s_time_t cur_deadline; /* current deadline for EDF */ +s_time_t cur_budget; /* current budget */ +s_time_t last_start; /* last start time */ +s_time_t cur_deadline; /* current deadline for EDF */ -unsigned flags; /* mark __RTDS_scheduled, etc.. */ +unsigned flags; /* mark __RTDS_scheduled, etc.. 
*/ }; /* * Domain */ struct rt_dom { -struct list_head sdom_elem; /* link list on rt_priv */ -struct domain *dom; /* pointer to upper domain */ +struct list_head sdom_elem; /* link list on rt_priv */ +struct domain *dom; /* pointer to upper domain */ }; /* @@ -241,13 +247,13 @@ static inline struct list_head *rt_replq(const struct scheduler *ops) * and the replenishment events queue. */ static int -__vcpu_on_q(const struct rt_vcpu *svc) +vcpu_on_q(const struct rt_vcpu *svc) { return !list_empty(>q_elem); } static struct rt_vcpu * -__q_elem(struct list_head *elem) +q_elem(struct list_head *elem) { return list_entry(elem, struct rt_vcpu, q_elem); } @@ -303,7 +309,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) svc->cur_budget, svc->cur_deadline, svc->last_start, -__vcpu_on_q(svc), +vcpu_on_q(svc), vcpu_runnable(svc->vcpu), svc->flags, keyhandler_scratch); @@ -339,28 +345,28 @@ rt_dump(const struct scheduler *ops) replq = rt_replq(ops); printk("Global RunQueue info:\n"); -list_for_each( iter, runq ) +list_for_each ( iter, runq ) { -svc = __q_elem(iter); +svc = q_elem(iter); rt_dump_vcpu(ops, svc); } printk("Global DepletedQueue info:\n"); -list_for_each( iter, depletedq ) +list_for_each ( iter, depletedq ) { -svc = __q_elem(iter); +svc = q_elem(iter); rt_dump_vcpu(ops, svc); } printk("Global Replenishment Events info:\n"); -list_fo
[Xen-devel] [PATCH 0/2] xen: sched: rtds refactor code
The first part of this patch series aims at fixing coding style issues for control structures. Because locks are grabbed in schedule.c before hooks are called, underscores in front of function names are removed. The second part replaces atomic bit-ops with non-atomic ones since locks are grabbed in schedule.c. Discussions: http://lists.xenproject.org/archives/html/xen-devel/2016-03/msg01528.html http://www.gossamer-threads.com/lists/xen/devel/431251?do=post_view_threaded#431251 Tianyang Chen (2): xen: sched: rtds refactor code xen: sched: rtds: use non-atomic bit-ops xen/common/sched_rt.c | 122 ++--- 1 file changed, 64 insertions(+), 58 deletions(-) -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH] xen: sched: rtds: refactor code
No functional change: -Various coding style fix -Added comments for UPDATE_LIMIT_SHIFT. Use non-atomic bit-ops: -Vcpu flags are checked and cleared atomically. Performance can be improved with corresponding non-atomic versions since schedule.c already has spin_locks in place. Suggested-by: Dario Faggioli <dario.faggi...@citrix.com> Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> --- xen/common/sched_rt.c | 122 ++--- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 7f8f411..1a18f6d 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -80,7 +80,7 @@ * in schedule.c * * The functions involes RunQ and needs to grab locks are: - *vcpu_insert, vcpu_remove, context_saved, __runq_insert + *vcpu_insert, vcpu_remove, context_saved, runq_insert */ @@ -107,6 +107,12 @@ */ #define RTDS_MIN_BUDGET (MICROSECS(10)) +/* + * UPDATE_LIMIT_SHIT: a constant used in rt_update_deadline(). When finding + * the next deadline, performing addition could be faster if the difference + * between cur_deadline and now is small. If the difference is bigger than + * 1024 * period, use multiplication. + */ #define UPDATE_LIMIT_SHIFT 10 /* @@ -158,25 +164,25 @@ static void repl_timer_handler(void *data); /* - * Systme-wide private data, include global RunQueue/DepletedQ + * System-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. 
It can be grabbed via vcpu_schedule_lock_irq() */ struct rt_private { -spinlock_t lock;/* the global coarse grand lock */ -struct list_head sdom; /* list of availalbe domains, used for dump */ -struct list_head runq; /* ordered list of runnable vcpus */ -struct list_head depletedq; /* unordered list of depleted vcpus */ -struct list_head replq; /* ordered list of vcpus that need replenishment */ -cpumask_t tickled; /* cpus been tickled */ -struct timer *repl_timer; /* replenishment timer */ +spinlock_t lock; /* the global coarse grand lock */ +struct list_head sdom; /* list of availalbe domains, used for dump */ +struct list_head runq; /* ordered list of runnable vcpus */ +struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ +cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer;/* replenishment timer */ }; /* * Virtual CPU */ struct rt_vcpu { -struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head q_elem; /* on the runq/depletedq list */ struct list_head replq_elem; /* on the replenishment events list */ /* Up-pointers */ @@ -188,19 +194,19 @@ struct rt_vcpu { s_time_t budget; /* VCPU current infomation in nanosecond */ -s_time_t cur_budget;/* current budget */ -s_time_t last_start;/* last start time */ -s_time_t cur_deadline; /* current deadline for EDF */ +s_time_t cur_budget; /* current budget */ +s_time_t last_start; /* last start time */ +s_time_t cur_deadline; /* current deadline for EDF */ -unsigned flags; /* mark __RTDS_scheduled, etc.. */ +unsigned flags; /* mark __RTDS_scheduled, etc.. 
*/ }; /* * Domain */ struct rt_dom { -struct list_head sdom_elem; /* link list on rt_priv */ -struct domain *dom; /* pointer to upper domain */ +struct list_head sdom_elem; /* link list on rt_priv */ +struct domain *dom; /* pointer to upper domain */ }; /* @@ -241,13 +247,13 @@ static inline struct list_head *rt_replq(const struct scheduler *ops) * and the replenishment events queue. */ static int -__vcpu_on_q(const struct rt_vcpu *svc) +vcpu_on_q(const struct rt_vcpu *svc) { return !list_empty(>q_elem); } static struct rt_vcpu * -__q_elem(struct list_head *elem) +q_elem(struct list_head *elem) { return list_entry(elem, struct rt_vcpu, q_elem); } @@ -303,7 +309,7 @@ rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc) svc->cur_budget, svc->cur_deadline, svc->last_start, -__vcpu_on_q(svc), +vcpu_on_q(svc), vcpu_runnable(svc->vcpu), svc->flags, keyhandler_scratch); @@ -339,28 +345,28 @@ rt_dump(const struct scheduler *ops) replq = rt_replq(ops); printk("Global RunQueue info:\n"); -list_for_each( iter, runq ) +list_for_each ( iter, runq ) { -svc = __q_elem(iter); +svc = q_elem(iter); rt_dump_vcpu(ops, svc); } printk("Global DepletedQueue i
[Xen-devel] [PATCH v10]xen: sched: convert RTDS from time to event driven model
The current RTDS code has several problems: - The scheduler, although the algorithm is event driven by nature, follows a time driven model (is invoked periodically!), making the code looks unnatural; - Budget replenishment logic, budget enforcement logic and scheduling decisions are mixed and entangled, making the code hard to understand; - The various queues of vcpus are scanned various times, making the code inefficient; This patch separates budget replenishment and enforcement by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A replenishment queue has been added to keep track of all vcpus that are runnable. In the main scheduling function, we now return when a scheduling decision should be made next time, such as when the budget of the currently running vcpu runs out. Finally, when waking up a vcpu, it tickles various vcpus appropriately, like all other schedulers do. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- changes since v9: Re-worded some of the comments Fixed the wrong returned value from rt_schedule(). changes since v8: Replaced spin_lock_irqsave with spin_lock_irq. Bug fix in burn_budget() where budget == 0. Removed unnecessary tickling in the timer handler when vcpus have the same deadline. changes since v7: Coding sytle. Added a flag to indicate that a vcpu was depleted. Added a local list including updated vcpus in the timer handler. It is used to decide which vcpu should tickle. 
changes since v6: Removed unnecessary vcpu_on_q() checks for runq_insert() Renamed replenishment queue related functions without underscores New replq_reinsert() function for rt_vcpu_wake() changes since v5: Removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer Removed unnecessary timer checks Added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. --- xen/common/sched_rt.c | 431 ++--- 1 file changed, 335 insertions(+), 96 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index bfed2e2..a43b8b8 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -3,7 +3,10 @@ * EDF scheduling is a real-time scheduling algorithm used in embedded field. * * by Sisu Xi, 2013, Washington University in Saint Louis - * and Meng Xu, 2014, University of Pennsylvania + * Meng Xu, 2014-2016, University of Pennsylvania + * + * Conversion toward event driven model by Tianyang Chen + * and Dagaen Golomb, 2016, University of Pennsylvania * * based on the code of credit Scheduler */ @@ -16,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +91,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -115,6 +119,16 @@ #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) /* + * RTDS_depleted: Does this vcp run out of budget? 
+ * This flag is + * + set in burn_budget() if a vcpu has zero budget left; + * + cleared and checked in the repenishment handler, + * for the vcpus that are being replenished. + */ +#define __RTDS_depleted 3 +#define RTDS_depleted (1<<__RTDS_depleted) + +/* * rt tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ @@ -142,6 +156,13 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +static void repl_timer_handler(void *data); + +#define deadline_runq_insert(...) \ + deadline_queue_insert(&__q_elem, ##__VA_ARGS__) +#define deadline_replq_insert(...) \ + deadline_queue_insert(_elem, ##__VA_ARGS__) + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +173,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct
[Xen-devel] [PATCH v11]xen: sched: convert RTDS from time to event driven model
The current RTDS code has several problems: - the scheduler, although the algorithm is event driven by nature, follows a time driven model (is invoked periodically!), making the code look unnatural; - budget replenishment logic, budget enforcement logic and scheduling decisions are mixed and entangled, making the code hard to understand; - the various queues of vcpus are scanned various times, making the code inefficient; This patch separates budget replenishment and enforcement. It does that by handling the former with a dedicated timer, and a queue of pending replenishment events. A replenishment queue has been added to keep track of all vcpus that are runnable. We also make sure that the main scheduling function is called when a scheduling decision is necessary, such as when the currently running vcpu runs out of budget. Finally, when waking up a vcpu, it is now enough to tickle the various CPUs appropriately, like all other schedulers also do. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- changes since v10: Re-worded commit messages. Moved helper macros to where they belong. changes since v9: Re-worded some of the comments Fixed the wrong returned value from rt_schedule(). changes since v8: Replaced spin_lock_irqsave with spin_lock_irq. Bug fix in burn_budget() where budget == 0. Removed unnecessary tickling in the timer handler when vcpus have the same deadline. changes since v7: Coding sytle. Added a flag to indicate that a vcpu was depleted. Added a local list including updated vcpus in the timer handler. It is used to decide which vcpu should tickle. 
changes since v6: Removed unnecessary vcpu_on_q() checks for runq_insert() Renamed replenishment queue related functions without underscores New replq_reinsert() function for rt_vcpu_wake() changes since v5: Removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer Removed unnecessary timer checks Added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. --- xen/common/sched_rt.c | 429 ++--- 1 file changed, 333 insertions(+), 96 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index bfed2e2..fc3863e 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -3,7 +3,10 @@ * EDF scheduling is a real-time scheduling algorithm used in embedded field. * * by Sisu Xi, 2013, Washington University in Saint Louis - * and Meng Xu, 2014, University of Pennsylvania + * Meng Xu, 2014-2016, University of Pennsylvania + * + * Conversion toward event driven model by Tianyang Chen + * and Dagaen Golomb, 2016, University of Pennsylvania * * based on the code of credit Scheduler */ @@ -16,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +91,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -115,6 +119,16 @@ #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) /* + * RTDS_depleted: Does this vcp run out of budget? 
+ * This flag is + * + set in burn_budget() if a vcpu has zero budget left; + * + cleared and checked in the repenishment handler, + * for the vcpus that are being replenished. + */ +#define __RTDS_depleted 3 +#define RTDS_depleted (1<<__RTDS_depleted) + +/* * rt tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ @@ -142,6 +156,8 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +static void repl_timer_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +168,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishment timer */ }; /* @@ -160,6 +1
[Xen-devel] [PATCH v9]xen: sched: convert RTDS from time to event driven model
Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A replenishment queue has been added to keep track of all vcpus that are runnable. The following functions have major changes to manage the replenishment events and timer. repl_handler(): It is a timer handler which is re-programmed to fire at the nearest vcpu deadline to replenish vcpus. rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). If an idle vcpu is picked, -1 is returned to avoid busy-waiting. repl_update() has been removed. rt_vcpu_wake(): when a vcpu wakes up, it tickles instead of picking one from the run queue. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Simplified funtional graph: schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_vcpu_on_q(i)> { replenish(i) runq_tickle(i) }> program_timer() [spin_lock] Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- changes since v8: Replaced spin_lock_irqsave with spin_lock_irq. Bug fix in burn_budget() where budget == 0. Removed unnecessary tickling in the timer handler when vcpus have the same deadline. changes since v7: Coding sytle. Added a flag to indicate that a vcpu was depleted. Added a local list including updated vcpus in the timer handler. It is used to decide which vcpu should tickle. 
changes since v6: Removed unnecessary vcpu_on_q() checks for runq_insert() Renamed replenishment queue related functions without underscores New replq_reinsert() function for rt_vcpu_wake() changes since v5: Removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer Removed unnecessary timer checks Added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. --- xen/common/sched_rt.c | 437 ++--- 1 file changed, 341 insertions(+), 96 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index bfed2e2..b491915 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -3,7 +3,9 @@ * EDF scheduling is a real-time scheduling algorithm used in embedded field. * * by Sisu Xi, 2013, Washington University in Saint Louis - * and Meng Xu, 2014, University of Pennsylvania + * Meng Xu, 2014-2016, University of Pennsylvania + * Tianyang Chen, 2016, University of Pennsylvania + * Dagaen Golomb, 2016, University of Pennsylvania * * based on the code of credit Scheduler */ @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +90,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -115,6 +118,18 @@ #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) /* + * The replenishment timer handler needs to check this bit + * to know where a replenished vcpu was, when deciding which + * vcpu should tickle. 
+ * A replenished vcpu should tickle if it was moved from the + * depleted queue to the run queue. + * + Set in burn_budget() if a vcpu has zero budget left. + * + Cleared and checked in the repenishment handler. + */ +#define __RTDS_was_depleted 3 +#define RTDS_was_depleted (1<<__RTDS_was_depleted) + +/* * rt tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ @@ -142,6 +157,9 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +170,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list
[Xen-devel] [PATCH v8] xen: sched: convert RTDS from time to event driven model
Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A replenishment queue has been added to keep track of all vcpus that are runnable. The following functions have major changes to manage the replenishment events and timer. repl_handler(): It is a timer handler which is re-programmed to fire at the nearest vcpu deadline to replenish vcpus. rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). If an idle vcpu is picked, -1 is returned to avoid busy-waiting. repl_update() has been removed. rt_vcpu_wake(): when a vcpu wakes up, it tickles instead of picking one from the run queue. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Simplified funtional graph: schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_vcpu_on_q(i)> { replenish(i) runq_tickle(i) }> program_timer() [spin_lock] Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- changes since v7: Coding sytle. Added a flag to indicate that a vcpu was depleted. Added a local list including updated vcpus in the timer handler. It is used to decide which vcpu should tickle. 
changes since v6: Removed unnecessary vcpu_on_q() checks for runq_insert() Renamed replenishment queue related functions without underscores New replq_reinsert() function for rt_vcpu_wake() changes since v5: Removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer Removed unnecessary timer checks Added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. --- xen/common/sched_rt.c | 435 ++--- 1 file changed, 340 insertions(+), 95 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index bfed2e2..58189cd 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -3,7 +3,9 @@ * EDF scheduling is a real-time scheduling algorithm used in embedded field. * * by Sisu Xi, 2013, Washington University in Saint Louis - * and Meng Xu, 2014, University of Pennsylvania + * Meng Xu, 2014-2016, University of Pennsylvania + * Tianyang Chen, 2016, University of Pennsylvania + * Dagaen Golomb, 2016, University of Pennsylvania * * based on the code of credit Scheduler */ @@ -16,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +90,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -115,6 +118,18 @@ #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add) /* + * The replenishment timer handler needs to check this bit + * to know where a replenished vcpu was, when deciding which + * vcpu should tickle. 
+ * A replenished vcpu should tickle if it was moved from the + * depleted queue to the run queue. + * + Set in burn_budget() if a vcpu has zero budget left. + * + Cleared and checked in the repenishment handler. + */ +#define __RTDS_was_depleted 3 +#define RTDS_was_depleted (1<<__RTDS_was_depleted) + +/* * rt tracing events ("only" 512 available!). Check * include/public/trace.h for more details. */ @@ -142,6 +157,9 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +170,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishm
[Xen-devel] [PATCH v7] xen: sched: convert RTDS from time to event driven model
Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A replenishment queue has been added to keep track of all replenishment events. The following functions have major changes to manage the replenishment events and timer. repl_handler(): It is a timer handler which is re-programmed to fire at the nearest vcpu deadline to replenish vcpus. rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). If an idle vcpu is picked, -1 is returned to avoid busy-waiting. repl_update() has been removed. rt_vcpu_wake(): when a vcpu wakes up, it tickles instead of picking one from the run queue. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the run queue and it tickles. Simplified funtional graph: schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_vcpu_on_q(i)> { replenish(i) runq_tickle(i) }> program_timer() [spin_lock] Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- changes since v6: Removed unnecessary vcpu_on_q() checks for runq_insert() Renamed replenishment queue related functions without underscores New replq_reinsert() function for rt_vcpu_wake() changes since v5: Removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer Removed unnecessary timer checks Added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program 
timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. --- xen/common/sched_rt.c | 381 + 1 file changed, 286 insertions(+), 95 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index bfed2e2..06803be 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -142,6 +143,9 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +156,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishment timer */ }; /* @@ -160,6 +166,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head replq_elem;/* on the repl event list */ /* Up-pointers */ struct rt_dom *sdom; @@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ +return _priv(ops)->replq; +} + /* - * Queue helper functions for runq and depletedq + * Helper functions for manipulating the runqueue, the depleted queue, + * and the replenishment events queue. 
*/ static int __vcpu_on_q(const struct rt_vcpu *svc) @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * +replq_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, replq_elem); +} + +static int +vcpu_on_replq(const struct rt_vcpu *svc) +{ + return !list_empty(>replq_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -288,7 +313,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) static void rt_dump(const struct scheduler *ops) { -struc
Re: [Xen-devel] [PATCH v6][RFC] xen: sched: convert RTDS from time to event driven model
On 2/26/2016 4:11 AM, Dario Faggioli wrote: It looks like the current code doesn't add a vcpu to the replenishment queue when vcpu_insert() is called. When the scheduler is initialized, all the vcpus are added to the replenishment queue after waking up from sleep. This needs to be changed (add vcpu to replq in vcpu_insert()) to make it consistent in a sense that when rt_vcpu_insert() is called, it needs to have a corresponding replenishment event queued. This way the ASSERT() is for both cases in __runq_insert() to enforce the fact that "when a vcpu is inserted to runq/depletedq, a replenishment event is waiting for it". I am ok with this (calling replq_insert() in rt_vcpu_insert()). Not just doing that unconditionally though, as a vcpu can, e.g., be paused when the insert_vcpu hook is called, and in that case, I don't think we want to enqueue the replenishment event, do we? Yes. static void rt_context_saved(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu *svc = rt_vcpu(vc); spinlock_t *lock = vcpu_schedule_lock_irq(vc); clear_bit(__RTDS_scheduled, >flags); /* not insert idle vcpu to runq */ if ( is_idle_vcpu(vc) ) goto out; if ( test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && likely(vcpu_runnable(vc)) ) { __runq_insert(ops, svc); runq_tickle(ops, snext); } else __replq_remove(ops, svc); out: vcpu_schedule_unlock_irq(lock, vc); } And, as said above, if you do this, try also removing the __replq_remove() call from rt_vcpu_sleep(), this one should cover for that (and, actually, more than just that!). After all this, check whether you still have the assert in __replq_insert() triggering and let me know So after moving the __replq_remove() to rt_context_saved(), the assert in __replq_insert() still fails when dom0 boots up. Well, maybe removing __replq_remove() from rt_vcpu_sleep() is not entirely ok, as if we do that we fail to deal with the case of when (still in rt_vcpu_sleep()), __vcpu_on_q(svc) is true. 
So, I'd say, do as I said above wrt rt_context_saved(). For rt_vcpu_sleep(), you can try something like this: static void rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); BUG_ON( is_idle_vcpu(vc) ); SCHED_STAT_CRANK(vcpu_sleep); if ( curr_on_cpu(vc->processor) == vc ) cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ); else if ( __vcpu_on_q(svc) ) { __q_remove(svc); __replq_remove(svc); <=== *** LOOK HERE *** } else if ( svc->flags & RTDS_delayed_runq_add ) clear_bit(__RTDS_delayed_runq_add, >flags); } In fact, in both the first and the third case, we go will at some point pass through rt_context_switch(), and hit the __replq_remove() that I made you put there. In the case in the middle, as the vcpu was just queued, and for making it go to sleep, it is enough to remove it from the runq (or depletedq, in the case of this scheduler), we won't go through rt_schedule()-->rt_context_saved(), and hence the __replq_remove() won't be called. Sorry for the overlook, can you try this. That being said... (XEN) Assertion '!vcpu_on_replq(svc)' failed at sched_rt.c:524 (XEN) [ Xen-4.7-unstable x86_64 debug=y Tainted:C ] (XEN) Xen call trace: (XEN)[] sched_rt.c#__replq_insert+0x2b/0x64 (XEN)[] sched_rt.c#rt_vcpu_wake+0xf2/0x12c (XEN)[] vcpu_wake+0x213/0x3d4 (XEN)[] vcpu_unblock+0x4b/0x4d (XEN)[] vcpu_kick+0x20/0x6f (XEN)[] vcpu_mark_events_pending+0x2c/0x2f (XEN)[] event_2l.c#evtchn_2l_set_pending+0xa9/0xb9 (XEN)[] evtchn_send+0x158/0x183 (XEN)[] do_event_channel_op+0xe21/0x147d (XEN)[] lstar_enter+0xe2/0x13c (XEN) ... This says that the we call __replq_insert() from rt_vcpu_wake() and find out in there that vcpu_on_replq() is true. However, in v6 code for rt_vcpu_wake(), I can see this: +if( !__vcpu_on_replq(svc) ) +{ +__replq_insert(ops, svc); +ASSERT( vcpu_runnable(vc) ); +} which would make me think that, if vcpu_on_replq() is true, we shouldn't be calling __replq_insert() in the first place. 
So, have you made other changes wrt v6 when trying this? The v6 doesn't have the if statement commented out when I submitted it. But I tried commenting it out, the assertion failed. It fails in V6 with: rt_vcpu_sleep(): removing replenishment event for all cases rt_context_saved(): not removing replenishment events rt_vcpu_wake(): not checking if the event is already queued. or with: rt_vcpu_sleep(): not removing replenishment event at all rt_context_saved(): removing replenishment events if not runnable rt_vcpu_wake(): not checking if the event is already queued. Also with: rt_vcpu_sleep(): removing replenishment event if the vcpu is on runq/depletedq rt_context_saved(): removing replenishment events if not runnable rt_vcpu_wake(): not
Re: [Xen-devel] [PATCH v6][RFC] xen: sched: convert RTDS from time to event driven model
On 2/25/2016 6:31 PM, Dario Faggioli wrote: Hey again, Thanks for turning up so quickly. We are getting closer and closer, although (of course :-)) I have some more comments. However, is there a particular reason why you are keeping the RFC tag? Until you do that, it's like saying that you are chasing feedback, but you do not think yourself the patch should be considered for being upstreamed. As far as my opinion goes, this patch is not ready to go in right now (as I said, I've got questions and comments), but its status is way past RFC. Oh OK, I had the impression that RFC means request for comments and it should always be used because indeed, I'm asking for comments. On Thu, 2016-02-25 at 15:05 -0500, Tianyang Chen wrote: changes since v5: removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer removed unnecessary timer checks added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. So, this does not belong here. It is ok to have it in this part of the email, but it should not land in the actual commit changelog, once the patch will be committed into Xen's git tree. The way to achieve the above is to put this summary of changes below the actual changelog, and below the Signed-of-by lines, after a marker that looks like this "---". Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A new runningq has been added to keep track of all vcpus that are on pcpus. Mmm.. 
Is this the proper changelog? runningq is something we discussed, and that appeared in v2, but is certainly no longer here... :-O oops... Wait, maybe you misunderstood and/or I did not make myself clear enough (in which case, sorry). I never meant to say "always stop the timer". Atually, in one of my last email I said the opposite, and I think that would be the best thing to do. Do you think there is any specific reason why we need to always stop and restart it? If not, I think we can: - have deadline_queue_remove() also return whether the element removed was the first one (i.e., the one the timer was programmed after); - if it was, re-program the timer after the new front of the queue; - if it wasn't, nothing to do. Thoughts? It was my thought originally that the timer needs to be re-programmed only when the top vcpu is taken off. So did you mean when I manipulated repl_timer->expires manually, the timer should be stopped using proper timer API? The manipulation is gone now. Also, set_timer internally disables the timer so I assume it's safe just to call set_timer() directly, right? @@ -405,22 +500,38 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) /* add svc to runq if svc still has budget */ if ( svc->cur_budget > 0 ) -{ -list_for_each(iter, runq) -{ -struct rt_vcpu * iter_svc = __q_elem(iter); -if ( svc->cur_deadline <= iter_svc->cur_deadline ) -break; - } -list_add_tail(>q_elem, iter); -} +deadline_queue_insert(&__q_elem, svc, >q_elem, runq); else { list_add(>q_elem, >depletedq); +ASSERT( __vcpu_on_replq(svc) ); So, by doing this, you're telling me that, if the vcpu is added to the depleted queue, there must be a replenishment planned for it (or the ASSERT() would fail). But if we are adding it to the runq, there may or may not be a replenishment planned for it. Is this what we want? Why there must not be a replenishment planned already for a vcpu going into the runq (to happen at its next deadline)? 
It looks like the current code doesn't add a vcpu to the replenishment queue when vcpu_insert() is called. When the scheduler is initialized, all the vcpus are added to the replenishment queue after waking up from sleep. This needs to be changed (add vcpu to replq in vcpu_insert()) to make it consistent in a sense that when rt_vcpu_insert() is called, it needs to have a corresponding replenishment event queued. This way the ASSERT() is for both cases in __runq_insert() to enforce the fact that "when a vcpu is inserted to runq/depletedq, a replenishment event is waiting for it". @@ -1087,10 +1193,6 @@ static void rt_context_saved(const struct scheduler *ops, struct vcpu *vc) { stru
[Xen-devel] [PATCH v6][RFC] xen: sched: convert RTDS from time to event driven model
changes since v5: removed unnecessary vcpu_on_replq() checks deadline_queue_insert() returns a flag to indicate if it's needed to re-program the timer removed unnecessary timer checks added helper functions to remove vcpus from queues coding style Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A new runningq has been added to keep track of all vcpus that are on pcpus. The following functions have major changes to manage the runningq and replenishment repl_handler(): It is a timer handler which is re-programmed to fire at the nearest vcpu deadline to replenish vcpus on runq, depeletedq and runningq. When the replenishment is done, each replenished vcpu should tickle a pcpu to see if it needs to preempt any running vcpus. rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). If an idle vcpu is picked, -1 is returned to avoid busy-waiting. repl_update() has been removed. rt_vcpu_wake(): when a vcpu is awake, it tickles instead of picking one from runq. When a vcpu wakes up, it might reprogram the timer if it has a more recent release time. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Runningq is updated and picking from runq and tickling are removed. 
Simplified funtional graph: schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_vcpu_on_q(i)> { replenish(i) runq_tickle(i) }> program_timer() [spin_lock] Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 328 - 1 file changed, 244 insertions(+), 84 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 2e5430f..16f77f9 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -142,6 +143,9 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +156,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishment timer */ }; /* @@ -160,6 +166,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head replq_elem;/* on the repl event list */ /* Up-pointers */ struct rt_dom *sdom; @@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head 
*rt_replq(const struct scheduler *ops) +{ +return _priv(ops)->replq; +} + /* - * Queue helper functions for runq and depletedq + * Queue helper functions for runq, depletedq + * and replenishment event queue */ static int __vcpu_on_q(const struct rt_vcpu *svc) @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * +__replq_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, replq_elem); +} + +static int +__vcpu_on_replq(const struct rt_vcpu *svc) +{ + return !list_empty(>replq_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -288,7 +313,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) static void rt_dump(const s
Re: [Xen-devel] [PATCH v5][RFC] xen: sched: convert RTDS from time to event driven model
On 2/25/2016 5:34 AM, Dario Faggioli wrote: + * it should be re-inserted back to the replenishment queue. + */ +if ( now >= svc->cur_deadline) +{ +rt_update_deadline(now, svc); +__replq_remove(ops, svc); +} + +if( !__vcpu_on_replq(svc) ) +__replq_insert(ops, svc); + And here I am again: is it really necessary to check whether svc is not in the replenishment queue? It looks to me that it really should not be there... but maybe it can, because we remove the event from the queue when the vcpu sleeps, but *not* when the vcpu blocks? Yeah. That is the case where I keep getting assertion failure if it's removed. Which one ASSERT() fails? The replq_insert() fails because it's already on the replenishment queue when rt_vcpu_wake() is trying to insert a vcpu again. (XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:527 (XEN) [ Xen-4.7-unstable x86_64 debug=y Tainted:C ] (XEN) CPU:0 (XEN) RIP:e008:[] sched_rt.c#rt_vcpu_wake+0xf0/0x17f (XEN) RFLAGS: 00010002 CONTEXT: hypervisor (d0v0) (XEN) rax: 0001 rbx: 83023b522940 rcx: 0001 (XEN) rdx: 0031bb1b9980 rsi: 82d080342318 rdi: 83023b486ca0 (XEN) rbp: 8300bfcffd88 rsp: 8300bfcffd58 r8: 0004 (XEN) r9: deadbeef r10: 82d08025f5c0 r11: 0206 (XEN) r12: 83023b486ca0 r13: 8300bfd46000 r14: 82d080299b80 (XEN) r15: 83023b522d80 cr0: 80050033 cr4: 000406a0 (XEN) cr3: 000231c0d000 cr2: 880001e80ba8 (XEN) ds: es: fs: gs: ss: e010 cs: e008 (XEN) Xen stack trace from rsp=8300bfcffd58: (XEN)8300bfcffd70 8300bfd46000 000216110572 83023b522940 (XEN)82d08032bc00 0282 8300bfcffdd8 82d08012be0c (XEN)83023b4b5000 83023b4f1000 8300bfd47000 8300bfd46000 (XEN) 83023b4b4280 00014440 0001 (XEN)8300bfcffde8 82d08012c327 8300bfcffe08 82d080169cea (XEN)83023b4b5000 000a 8300bfcffe18 82d080169d65 (XEN)8300bfcffe38 82d08010762a 83023b4b4280 83023b4b5000 (XEN)8300bfcffe68 82d08010822a 8300bfcffe68 fff2 (XEN)88022056dcb4 880230c34440 8300bfcffef8 82d0801096fc (XEN)8300bfcffef8 8300bfcfff18 8300bfcffef8 82d080240e85 (XEN)88020001 0246 810013aa (XEN)000a 
810013aa e030 8300bfd47000 (XEN)8802206597f0 880230c34440 00014440 0001 (XEN)7cff403000c7 82d0802439e2 8100140a 0020 (XEN)88022063c7d0 88022063c7d0 0001 dca0 (XEN)88022056dcb8 880230c34440 0206 0004 (XEN)8802230001a0 880220619000 0020 8100140a (XEN) 88022056dcb4 0004 00010100 (XEN)8100140a e033 0206 88022056dc90 (XEN)e02b (XEN) Xen call trace: (XEN)[] sched_rt.c#rt_vcpu_wake+0xf0/0x17f (XEN)[] vcpu_wake+0x213/0x3d4 (XEN)[] vcpu_unblock+0x4b/0x4d (XEN)[] vcpu_kick+0x20/0x6f (XEN)[] vcpu_mark_events_pending+0x2c/0x2f (XEN)[] event_2l.c#evtchn_2l_set_pending+0xa9/0xb9 (XEN)[] evtchn_send+0x158/0x183 (XEN)[] do_event_channel_op+0xe21/0x147d (XEN)[] lstar_enter+0xe2/0x13c (XEN) (XEN) (XEN) (XEN) Panic on CPU 0: (XEN) Assertion '!__vcpu_on_replq(svc)' failed at sched_rt.c:527 (XEN) I'm thinking when a vcpu unblocks, it could potentially fall through here. Well, when unblocking, wake() is certainly called, yes. And like you said, mostly spurious sleep happens when a vcpu is running and it could happen in other cases, although rare. I think I said already there's no such thing as "spurious sleep". Or at least, I can't think of anything that I would define a spurious sleep, if you do, please, explain what situation you're referring to. I meant to say spurious wakeup... If rt_vcpu_sleep() removes vcpus from replenishment queue, it's perfectly safe for rt_vcpu_wake() to insert them back. So, I'm suspecting it's the spurious wakeup that's causing troubles because vcpus are not removed prior to rt_vcpu_wake(). However, the two RETURNs at the beginning of rt_vcpu_wake() should catch that shouldn't it? In any case, one way of dealing with vcpus blocking/offlining/etc could be to, in context_saved(), in case we are not adding the vcpu back to the runq, cancel its replenishment event with __replq_remove(). (This may make it possible to avoid doing it in rt_vcpu_sleep() too, but you'll need to check and test.) Can
Re: [Xen-devel] [PATCH v5][RFC] xen: sched: convert RTDS from time to event driven model
On 2/24/2016 9:02 PM, Dario Faggioli wrote: Hey, Here I am, sorry for the delay. :-( No problem, I think we are almost there. On Mon, 2016-02-08 at 23:33 -0500, Tianyang Chen wrote: Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> So, the actual changelog... why did it disappear? :-) I will add it in the next version. diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 2e5430f..1f0bb7b 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -388,6 +424,66 @@ __q_remove(struct rt_vcpu *svc) } /* + * Removing a vcpu from the replenishment queue could + * re-program the timer for the next replenishment event + * if the timer is currently active + */ +static inline void +__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc) +{ +struct rt_private *prv = rt_priv(ops); +struct list_head *replq = rt_replq(ops); +struct timer* repl_timer = prv->repl_timer; + +if ( __vcpu_on_replq(svc) ) +{ So, can this be false? If yes, when and why? +/* + * disarm the timer if removing the first replenishment event + * which is going to happen next + */ +if( active_timer(repl_timer) ) +{ And here again, isn't it the case that, if there is a vcpu in the replenishment queue, then the timer _must_ be active? (I.e., if the above is true, isn't this also always true?) So, the reason for this check is that the code inside timer handler also calls this function when timer is stopped. 
We don't want to start the timer inside the timer handler when it's manipulating the replenishment queue right? Because it could be removing/adding back more than one vcpu and the timer should only be set at the very end of the timer handler. In fact, I copied a static function from timer.c just to check if a timer is active or not. Not sure if this is a good practice or not. +struct rt_vcpu *next_repl = __replq_elem(replq->next); + +if( next_repl->cur_deadline == svc->cur_deadline ) +repl_timer->expires = 0; + I think we need to call stop_timer() here, don't we? I know that set_timer() does that itself, but I think it is indeed necessary. E.g., right now, you're manipulating the timer without the timer lock. So when the timer handler is protected by a global lock, it is still necessary to stop the timer? Also, the timer only runs on a single CPU for a scheduler too. +list_del_init(>replq_elem); + +/* re-arm the timer for the next replenishment event */ +if( !list_empty(replq) ) +{ +struct rt_vcpu *svc_next = __replq_elem(replq- next); +set_timer(repl_timer, svc_next->cur_deadline); +} +} + +else +list_del_init(>replq_elem); I don't like the structure of this function, and I'd ask for a different if/then/else logic to be used, but let's first see --by answering the questions above-- if some of these if-s can actually go away. :-) So this function reduces to: /* * disarm the timer if removing the first replenishment event * which is going to happen next */ if( active_timer(repl_timer) ) { stop_timer(replq_timer); repl_timer->expires = 0; list_del_init(>replq_elem); /* re-arm the timer for the next replenishment event */ if( !list_empty(replq) ) { struct rt_vcpu *svc_next = __replq_elem(replq->next); set_timer(repl_timer, svc_next->cur_deadline); } } else list_del_init(>replq_elem); +} +} + +/* + * An utility function that inserts a vcpu to a + * queue based on certain order (EDF) End long comments with a full stop. 
+ */ +static void +_deadline_queue_insert(struct rt_vcpu * (*_get_q_elem)(struct list_head *elem), There's really a lot of "underscore-prefixing" in this file. This is, of course, not your fault, and should not be addressed in this patch. But, at least when it comes to new code, let's avoid making things worse. So, "deadline_queue_insert()" is just ok, I think. Yes, I know I did suggest to call it how it's called in this patch, underscore included, but I now think it would be better to get
Re: [Xen-devel] [PATCH v5][RFC]xen: sched: convert RTDS from time to event driven model
On 2/15/2016 10:55 PM, Meng Xu wrote: Hi Tianyang, Thanks for the patch! Great work and really quick action! :-) I will just comment on something I quickly find out and would look forwarding to Dario's comment. On Mon, Feb 8, 2016 at 11:33 PM, Tianyang Chen <ti...@seas.upenn.edu <mailto:ti...@seas.upenn.edu>> wrote: > Changes since v4: > removed unnecessary replenishment queue checks in vcpu_wake() > extended replq_remove() to all cases in vcpu_sleep() > used _deadline_queue_insert() helper function for both queues > _replq_insert() and _replq_remove() program timer internally > > Changes since v3: > removed running queue. > added repl queue to keep track of repl events. > timer is now per scheduler. > timer is init on a valid cpu in a cpupool. > > Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu <mailto:ti...@seas.upenn.edu>> > Signed-off-by: Meng Xu <men...@cis.upenn.edu <mailto:men...@cis.upenn.edu>> > Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu <mailto:dgol...@seas.upenn.edu>> > --- > xen/common/sched_rt.c | 337 - > 1 file changed, 251 insertions(+), 86 deletions(-) > > diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c > index 2e5430f..1f0bb7b 100644 > --- a/xen/common/sched_rt.c > +++ b/xen/common/sched_rt.c > @@ -16,6 +16,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -87,7 +88,7 @@ > #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) > > #define UPDATE_LIMIT_SHIFT 10 > -#define MAX_SCHEDULE(MILLISECS(1)) > + > /* > * Flags > */ > @@ -142,6 +143,12 @@ static cpumask_var_t *_cpumask_scratch; > */ > static unsigned int nr_rt_ops; > > +/* handler for the replenishment timer */ > +static void repl_handler(void *data); > + > +/* checks if a timer is active or not */ > +bool_t active_timer(struct timer* t); > + > /* > * Systme-wide private data, include global RunQueue/DepletedQ > * Global lock is referenced by schedule_data.schedule_lock from all > @@ -152,7 +159,9 @@ struct rt_private { > struct list_head 
sdom; /* list of availalbe domains, used for dump */ > struct list_head runq; /* ordered list of runnable vcpus */ > struct list_head depletedq; /* unordered list of depleted vcpus */ > +struct list_head replq; /* ordered list of vcpus that need replenishment */ > cpumask_t tickled; /* cpus been tickled */ > +struct timer *repl_timer; /* replenishment timer */ > }; > > /* > @@ -160,6 +169,7 @@ struct rt_private { > */ > struct rt_vcpu { > struct list_head q_elem;/* on the runq/depletedq list */ > +struct list_head replq_elem;/* on the repl event list */ > > /* Up-pointers */ > struct rt_dom *sdom; > @@ -213,8 +223,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) > return _priv(ops)->depletedq; > } > > +static inline struct list_head *rt_replq(const struct scheduler *ops) > +{ > +return _priv(ops)->replq; > +} > + > /* > - * Queue helper functions for runq and depletedq > + * Queue helper functions for runq, depletedq > + * and replenishment event queue > */ > static int > __vcpu_on_q(const struct rt_vcpu *svc) > @@ -228,6 +244,18 @@ __q_elem(struct list_head *elem) > return list_entry(elem, struct rt_vcpu, q_elem); > } > > +static struct rt_vcpu * > +__replq_elem(struct list_head *elem) > +{ > +return list_entry(elem, struct rt_vcpu, replq_elem); > +} > + > +static int > +__vcpu_on_replq(const struct rt_vcpu *svc) > +{ > + return !list_empty(>replq_elem); > +} > + > /* > * Debug related code, dump vcpu/cpu information > */ > @@ -288,7 +316,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) > static void > rt_dump(const struct scheduler *ops) > { > -struct list_head *runq, *depletedq, *iter; > +struct list_head *runq, *depletedq, *replq, *iter; > struct rt_private *prv = rt_priv(ops); > struct rt_vcpu *svc; > struct rt_dom *sdom; > @@ -301,6 +329,7 @@ rt_dump(const struct scheduler *ops) > > runq = rt_runq(ops); > depletedq = rt_depletedq(ops); > +replq = rt_replq(ops); > > printk("Global RunQueue info:\n"); > list_for_each( iter, 
runq ) > @@ -316,6 +345,13 @@ rt_dump(const struct scheduler *ops) > rt_dump_vcpu(ops, svc); > } > > +
[Xen-devel] [PATCH v5][RFC]xen: sched: convert RTDS from time to event driven model
Changes since v4: removed unnecessary replenishment queue checks in vcpu_wake() extended replq_remove() to all cases in vcpu_sleep() used _deadline_queue_insert() helper function for both queues _replq_insert() and _replq_remove() program timer internally Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 337 - 1 file changed, 251 insertions(+), 86 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 2e5430f..1f0bb7b 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -142,6 +143,12 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + +/* checks if a timer is active or not */ +bool_t active_timer(struct timer* t); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +159,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need replenishment */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishment timer */ }; /* @@ -160,6 +169,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head replq_elem;/* on the repl 
event list */ /* Up-pointers */ struct rt_dom *sdom; @@ -213,8 +223,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ +return _priv(ops)->replq; +} + /* - * Queue helper functions for runq and depletedq + * Queue helper functions for runq, depletedq + * and replenishment event queue */ static int __vcpu_on_q(const struct rt_vcpu *svc) @@ -228,6 +244,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * +__replq_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, replq_elem); +} + +static int +__vcpu_on_replq(const struct rt_vcpu *svc) +{ + return !list_empty(>replq_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -288,7 +316,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) static void rt_dump(const struct scheduler *ops) { -struct list_head *runq, *depletedq, *iter; +struct list_head *runq, *depletedq, *replq, *iter; struct rt_private *prv = rt_priv(ops); struct rt_vcpu *svc; struct rt_dom *sdom; @@ -301,6 +329,7 @@ rt_dump(const struct scheduler *ops) runq = rt_runq(ops); depletedq = rt_depletedq(ops); +replq = rt_replq(ops); printk("Global RunQueue info:\n"); list_for_each( iter, runq ) @@ -316,6 +345,13 @@ rt_dump(const struct scheduler *ops) rt_dump_vcpu(ops, svc); } +printk("Global Replenishment Event info:\n"); +list_for_each( iter, replq ) +{ +svc = __replq_elem(iter); +rt_dump_vcpu(ops, svc); +} + printk("Domain info:\n"); list_for_each( iter, >sdom ) { @@ -388,6 +424,66 @@ __q_remove(struct rt_vcpu *svc) } /* + * Removing a vcpu from the replenishment queue could + * re-program the timer for the next replenishment event + * if the timer is currently active + */ +static inline void +__replq_remove(const struct scheduler *ops, struct rt_vcpu *svc) +{ +struct rt_private *prv = rt_priv(ops); +struct list_head *replq = 
rt_replq(ops); +struct timer* repl_timer = prv->repl_timer; + +if ( __vcpu_on_replq(svc) ) +{ +/* + * disarm the timer if removing the first replenishment event + * which is going to happen next + */ +if( active_timer(repl_timer) ) +{ +struct rt_vcpu *next_repl = __replq_elem(replq->next); + +if( next_repl->cur_deadline == svc->cur_deadline ) +repl_timer->expires = 0; + +list_del_init(>re
Re: [Xen-devel] [PATCH v4] xen: sched: convert RTDS from time to event driven model
On 2/8/2016 6:27 AM, Dario Faggioli wrote: On Fri, 2016-02-05 at 23:27 -0500, Tianyang Chen wrote: On 2/5/2016 9:39 AM, Dario Faggioli wrote: On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote: I see. So I'm just curious what can cause spurious wakeup? Does it only happen to a running vcpu (currently on pcpu), and a vcpu that is on either runq or depletedq? Hehe, "that's virt, baby!" :-P No, seriously, vcpu being already running or already in a queue, is indeed what makes me call a wakeup a "spurious" wakeup. You can check at the performance counters that we update in those cases, to see when they happen most. In my experience, on Credit, I've seen them happening mostly when a vcpu is running. For instance: root@Zhaman:~# xenperf |grep vcpu_wake sched: vcpu_wake_runningT=39 sched: vcpu_wake_onrunq T= 0 sched: vcpu_wake_runnable T= 59875 sched: vcpu_wake_not_runnable T= 0 So we realized that this is just a spurious wakeup, and get back to what we were doing. What's wrong with this I just said? The reason why I wanted to add a vcpu back to replq is because it could be taken off from the timer handler. Now, because of the spurious wakeup, I think the timer shouldn't take any vcpus off of replq, in which case wake() should be doing nothing just like other schedulers when it's a spurious wakeup. Also, only sleep() should remove events from replq. Err... well, that depends on the how the code ends up looking like, and it's start to become difficult to reason on it like this. Perhaps, considering that it looks to me that we are in agreement on all the important aspects, you can draft a new version and we'll reason and comment on top of that? Mmm... no, I think we should know/figure out whether a replenishment is pending already and we are ok with it, or if we need a new one. Just to make sure, spurious sleep/wakeup are for a vcpu that is on pcpu or any queue right? 
Yes, spurious wakeup is how I'm calling wakeups that needs no action from the schedule, because the vcpu is already awake (and either running or runnable). I don't think there is such a thing as spurious sleep... When sleep is called, we go to sleep. There are cases where the sleep-->wakeup turnaround is really really fast, but I would not call them "spurious sleeps", and they should not need much special treatment, not in sched_rt.c at least. Oh, maybe you mean the cases where you wakeup and you find out that context_saved() has not run yet (just occurred by looking at the code below)? Well, yes... But I wouldn't call those "spurious sleeps" either. If a real sleep happens, it should be taken off from replq. However, in spurious wakeup (which I assume corresponds to a spurious sleep?), it shouldn't be taken off from replq. Adding back to replq should happen for those vcpus which were taken off because of "real sleep". I don't really follow, but I have the feeling that you're making it sound more complicated like it is in reality... :-) So my reasoning is that, when sleep() is called in sched_rt.c, a vcpu will need to be taken off from the replenishment event queue. However, a vcpu can be put to sleep/not-runnable without even calling sleep(), which corresponds to a later "spurious wakeup". Is there any way that RTDS can know when this happens? If not, in rt_vcpu_wake(), it needs to check, in all situations, if a vcpu is on the replenishment event queue or not. If not, add it back. Am I right? vcpu running --> sleep(), taken off from replq vcpu on queue --> sleep(), taken off from replq vcpu not on queue --> sleep(), taken off from replq However, vcpu running/on queue/not on queue --> just not runnable anymore (still on replq) --> spurious wakeup (still on replq) vcpus shouldn't be added back to replq again. So in all cases, rt_vcpu_wake() always need to check if a vcpu is on the replq or not.
Now I think because of the addition of a replenishment queue, spurious wake up cannot be simply treated as NOP. Just like V4, the "Out" label basically maintains the replenishment queue. Anyways, I agree it's gonna be easier to reason on v5 where all changes are applied. Thanks, Tianyang ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v4] xen: sched: convert RTDS from time to event driven model
On 2/5/2016 9:39 AM, Dario Faggioli wrote: On Wed, 2016-02-03 at 21:23 -0500, Tianyang Chen wrote: On 2/3/2016 7:39 AM, Dario Faggioli wrote: On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote: So what do we do, we raise the *_delayed_runq_add flag and continue and leave the vcpu alone. This happens, for instance, when there is a race between scheduling out a vcpu that is going to sleep, and the vcpu in question being woken back up already! It's not frequent, but we need to take care of that. At some point (sooner rather tan later, most likely) the context switch will actually happen, and the vcpu that is waking up is put in the runqueue. Yeah I agree with this situation. But I meant to say when a vcpu is waking up while it's still on a queue. See below. So, if a vcpu wakes up while it is running on a pcpu already, or while it is already woken and has already been put in the runqueue, these are SPURIOUS WAKEUP, which should be dealt with by just doing nothing, as every scheduler is doing This is what I'm tring to say since a couple of email, let's see if I've said it clear enough this time. :-D I see. So I'm just curious what can cause spurious wakeup? Does it only happen to a running vcpu (currently on pcpu), and a vcpu that is on either runq or depletedq? static void rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); s_time_t now = NOW(); struct rt_private *prv = rt_priv(ops); BUG_ON( is_idle_vcpu(vc) ); if ( unlikely(curr_on_cpu(vc->processor) == vc) ) { SCHED_STAT_CRANK(vcpu_wake_running); return; } For this situation, a vcpu is waking up on a pcpu. It could be taken off from the replq inside the timer handler at the end. So, if we decide to not replenish any sleeping/un-runnable vcpus any further, we should probably add it back to replq here right? I'm sorry, I don't understand what you're saying here. The vcpu woke up a few time ago. We went through here. 
At the time it was not on any pcpu and it was not in the runqueue, so we did plan a replenishment event for it in the replenishment queue, at it's next deadline. Now we're here again, for some reason. We figure out that the vcpu is running already and, most likely (feel free to add an ASSERT() inside the if to that effect), it will also still have its replenishment event queued. So we realized that this is just a spurious wakeup, and get back to what we were doing. What's wrong with this I just said? The reason why I wanted to add a vcpu back to replq is because it could be taken off from the timer handler. Now, because of the spurious wakeup, I think the timer shouldn't take any vcpus off of replq, in which case wake() should be doing nothing just like other schedulers when it's a spurious wakeup. Also, only sleep() should remove events from replq. /* on RunQ/DepletedQ, just update info is ok */ if ( unlikely(__vcpu_on_q(svc)) ) { SCHED_STAT_CRANK(vcpu_wake_onrunq); return; } For this situation, a vcpu could be going through the following phases: on runq --> sleep --> (for a long time) --> wake up Well, but then we're not here, because when it went to sleep, it's been taken off the runqueue, hasn't it? Actually, I do think that you need to be running to go to sleep, but anyway, even if a vcpu could go to sleep or block from the runqueue, it's not going to stay in the runqueue, and the first wakeup that occurs does not find it on the runqueue, and hence is not covered by this case... Is it? uh, I missed the __q_remove() from sleep. It will not end up here in the case I mentioned. When it wakes up, it could stay in the front of the runq because cur_deadline hasn't been updated. It will, inherently have some high priority-ness (because of EDF) and be picked over other vcpus right? Do we want to update it's deadline and budget here and re-insert it accordingly? 
When it wakes up and it is found not to be either running or in the runqueue already, yes, we certainly want to do that! But if some other (again, spurious) wakeup occurs, and find it already in the runqueue (i.e., this case) we actually *don't* want to update its deadline again. Agree. if ( likely(vcpu_runnable(vc)) ) SCHED_STAT_CRANK(vcpu_wake_runnable); else SCHED_STAT_CRANK(vcpu_wake_not_runnable); if ( now >= svc->cur_deadline) { rt_update_deadline(now, svc); __replq_remove(svc); } __replq_insert(svc); For here, consider this scenario: on pcpu (originally on replq) --> sleep --> context_saved() but still asleep --> wake() Timer handler isn't triggered before it awakes. Then this vcpu will be added to replq again while it's already there. I'm not 100% positive if this situation is possible though. But judging from rt_context_saved(), it checks if a vcp
Re: [Xen-devel] [PATCH v4] xen: sched: convert RTDS from time to event driven model
On 2/3/2016 7:39 AM, Dario Faggioli wrote: On Tue, 2016-02-02 at 13:09 -0500, Tianyang Chen wrote: On 2/2/2016 10:08 AM, Dario Faggioli wrote: On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote: +struct rt_private *prv = rt_priv(ops); +struct list_head *replq = rt_replq(ops); +struct list_head *iter; + +ASSERT( spin_is_locked(>lock) ); + Is this really useful? Considering where this is called, I don't think it is. This basically comes from __q_insert(). I see. Well it's still not necessary, IMO, so don't add it. At some point, we may want to remove it from __runq_insert() too. The bottom line is: prv->lock is, currently, the runqueue lock (it's the lock for everything! :-D). It is pretty obvious that you need the runq lock to manipulate the runqueue. It is not the runqueue that you are manipulating here, it is the replenishment queue, but as a matter of fact, prv->lock serializes that too. So, if anything, it would be more useful to add a comment somewhere, noting (reinforcing, I should probably say) the point that also this new queue for replenishment is being serialized by prv->lock, than an assert like this one. Sure I have been searching for the definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq(). Do you know where they are defined? I did some name search in the entire xen directory but still nothing. They are auto-generated. Look in xen/include/xen/sched-if.h. +ASSERT( !__vcpu_on_p(svc) ); + +list_for_each(iter, replq) +{ +struct rt_vcpu * iter_svc = __p_elem(iter); +if ( svc->cur_deadline <= iter_svc->cur_deadline ) +break; +} + +list_add_tail(>p_elem, iter); +} This looks ok. The list_for_each()+list_ad_tail() part would be basically identical to what we have inside __runq_insert(), thgough, wouldn't it? It may make sense to define an _ordered_queue_insert() (or _deadline_queue_insert(), since it's alwas the deadline that is compared) utility function that we can then call in both cases. 
@@ -449,6 +507,7 @@ rt_init(struct scheduler *ops) INIT_LIST_HEAD(>sdom); INIT_LIST_HEAD(>runq); INIT_LIST_HEAD(>depletedq); +INIT_LIST_HEAD(>replq); Is there a reason why you don't do at least the allocation of the timer here? Because, to me, this looks the best place for that. cpumask_clear(>tickled); @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + +kill_timer(prv->repl_timer); + xfree(prv); And here, you're freeing prv, without having freed prv->timer, which is therefore leaked. @@ -632,6 +699,17 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) vcpu_schedule_unlock_irq(lock, vc); SCHED_STAT_CRANK(vcpu_insert); + +if( prv->repl_timer == NULL ) +{ +/* vc->processor has been set in schedule.c */ +cpu = vc->processor; + +repl_timer = xzalloc(struct timer); + +prv->repl_timer = repl_timer; +init_timer(repl_timer, repl_handler, (void *)ops, cpu); +} } This belong to rt_init, IMO. When a new pool is created, the corresponding scheduler is also initialized. But there is no pcpu assigned to that pool. Sure, and in fact, above, when commenting on rt_init() I said "Is there a reason why you don't do at least the allocation of the timer here?" But you're right, here I put it like all should belong to rt_init(), I should have been more clear. If init_timer() happens in rt_init, how does it know which cpu to pick? When move_domain() is called, there must be some pcpus in the destination pool and then vcpu_insert happens. Allocating memory for the rt_priv copy of this particular scheduler instance, definitely belong in rt_init(). Then, in this case, we could do the allocation in rt_init() and do the initialization later, perhaps here, or the first time that the timer needs to be started, or when the first pCPU is assigned to this scheduler (by being assigned to the cpupool this scheduler services). I honestly prefer the latter. 
That is to say, in rt_alloc_pdata(), you check whether prv->repl_timer is NULL and you allocate and initialize it for the pCPU being brought up. I also would put an explicit 'prv- repl_timer=NULL', with a comment that proper allocation and initialization will happen later, and the reason why that is the case, in rt_init(). What do you think? Yeah it makes sense to put some comments in rt_init to remind readers that the timer will be taken care of later in alloc_pdata(). @@ -841,7 +881,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); -__repl_update(ops, now); __repl_update() going away is of course fine. But we need to enact the budget
Re: [Xen-devel] [Bug] sched: credit2: Assertion failed
On 2/2/2016 3:29 AM, Dario Faggioli wrote: On Tue, 2016-02-02 at 01:18 -0500, Tianyang Chen wrote: The following script caused an unresponsive dom0 and it can not be reproduced all the time. The dom0 is using credit2 scheduler. #!/bin/bash xl cpupool-list -c xl cpupool-cpu-remove Pool-0 3 xl cpupool-cpu-remove Pool-0 2 xl cpupool-create name=\"r\" sched=\"credit\" xl cpupool-create name=\"t\" sched=\"credit\" xl cpupool-cpu-add r 3 xl cpupool-cpu-add t 2 Yes, this is known. It is because Credit2 does not support (hard) affinity. There is a series on the list by someone implementing that, but it needs refreshing, which I hope to get round to it soon. Another bug can be reproduced (not all the time) if the newly created pool is also credit2. There is an assertion failure (see bug.txt), which is essentially the same thing I got when trying to make the new rtds scheduler cpupool safe. This is also known, and it will be fixed when I'll get back and resubmit this: http://lists.xenproject.org/archives/html/xen-devel/2015-09/msg03811.html In fact, look at this patch: http://lists.xenproject.org/archives/html/xen-devel/2015-09/msg03813.html And I'll do that right in this days, so I'd say you can ignore this for RTDS (as it will get fixed by the same series). So both are known issues... I'll open bugs in our bugtracker for them to avoid forgetting or tripping over them again. Got it, thanks for the heads up. -- Tianyang ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH v4] xen: sched: convert RTDS from time to event driven model
On 2/2/2016 10:08 AM, Dario Faggioli wrote: On Sun, 2016-01-31 at 23:32 -0500, Tianyang Chen wrote: v4 is meant for discussion on the addition of replq. In which cases, you should always put something like RFC in the subject, so people knows what the intent is even by just skimming their inboxes. Got it. Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a pool and added to another, the lock equality assert in free_pdata() fails when Pool-0 is using rtds. Known issue. I will fix it for both Credit2 and RTDS, so just ignore it. Yep. I was wondering if I should fix this before sending out v4. diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 2e5430f..c36e5de 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + Unrelated to the topic. Can we have a dedicated patch that adds a comment above UPDATE_LIMIT_SHIFT, explainig what it is and how it's used. This is something low priority, of course. Sure. @@ -160,6 +166,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head p_elem;/* on the repl event list */ Mmm... 'p_elem' because the previous one was 'q_elem', and 'p' follows 'q', I guess. It feels a bit obscure, though, and I'd prefer a more 'talking' name. For now, and at this level, I guess I can live with it. But in other places, things need to be improved (see below). 
@@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ +return _priv(ops)->replq; +} + /* - * Queue helper functions for runq and depletedq + * Queue helper functions for runq, depletedq + * and repl event q */ So, in comments, can we use full words as much as possible. It makes them a lot easier to read (so, e.g., "replenishment events queue" or "queue of the replenishment events"). I know this very file's style is not like that, from this point of view... and, in fact, this is something I would like to change, when we get the chance (i.e., when touching the code for other reasons, like in this case). @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * +__p_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, p_elem); +} + So, while this may well still be fine... +static int +__vcpu_on_p(const struct rt_vcpu *svc) +{ + return !list_empty(>p_elem); +} + ...here I really would like to go for something that will make it much more obvious, just by giving a look at the code, where we are actually checking the vcpu to be, without having to remember that p is the replenishment queue. So, I don't know, maybe something like vcpu_on_replq(), or vcpu_needs_replenish() ? @@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc) list_del_init(>q_elem); } +static inline void +__p_remove(struct rt_vcpu *svc) +{ +if ( __vcpu_on_p(svc) ) +list_del_init(>p_elem); +} + replq_remove(), or vcpu_repl_cancel() (or something else) ? @@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) } /* + * Insert svc into the repl even list: + * vcpus that needs to be repl earlier go first. 
+ */ +static void +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc) +{ This is exactly what I've been trying to say above: __replq_insert() is ok, much better than __q_insert()! Do the same everywhere else. :-) Indeed, some of the names are confusing the first time I looked at RTDS. +struct rt_private *prv = rt_priv(ops); +struct list_head *replq = rt_replq(ops); +struct list_head *iter; + +ASSERT( spin_is_locked(>lock) ); + Is this really useful? Considering where this is called, I don't think it is. This basically comes from __q_insert(). I have been searching for the definitions of vcpu_schedule_lock_irq() and pcpu_schedule_lock_irq(). Do you know where they are defined? I did some name search in the entire xen directory but still nothing. +ASSERT( !__vcpu_on_p(svc) ); + +list_for_each(iter, replq) +{ +struct rt_vcpu * iter_svc = __p_elem(iter); +if ( svc->cur_deadline <= iter_svc->cur_deadline ) +break; +} + +list_add_tail(>p_elem, iter); +} This looks ok. The list_for_each()+list_ad_tail() part w
[Xen-devel] [Bug] sched: credit2: Assertion failed
The following script caused an unresponsive dom0 and it can not be reproduced all the time. The dom0 is using credit2 scheduler. #!/bin/bash xl cpupool-list -c xl cpupool-cpu-remove Pool-0 3 xl cpupool-cpu-remove Pool-0 2 xl cpupool-create name=\"r\" sched=\"credit\" xl cpupool-create name=\"t\" sched=\"credit\" xl cpupool-cpu-add r 3 xl cpupool-cpu-add t 2 Another bug can be reproduced(not all the time) if the newly created pool is also credit2. There is an assertion failure (see bug.txt), which is essentially the same thing I got when trying to make the new rtds scheduler cpupool safe. With different combinations of schedulers and number of pools created, dom0 is either unresponsive or kernel panics. Since it happens when a pcpu(free) is added to another pool, I'm guessing the place to look at is schedule_cpu_switch(), where a series of alloc() and free() are called. And indeed at the last free_pdata(), the assertion fails. Both of them happen on 4.7 unstable, git:2e46e3f Any idea on this is appreciated. -- Tianyang Chen Xen 4.7-unstable (XEN) Xen version 4.7-unstable (root@) (gcc (Ubuntu/Linaro 4.6.3-1ubuntu5) 4.6.3) debug=y Tue Feb 2 00:41:08 EST 2016 (XEN) Latest ChangeSet: Fri Jan 22 16:19:51 2016 +0100 git:2e46e3f (XEN) Console output is synchronous. 
(XEN) Bootloader: GRUB 1.99-21ubuntu3.16 (XEN) Command line: placeholder sched=credit2 loglvl=all guest_loglvl=all sync_console console_to_ring com1=115200,8n1,0x2f8,3 console=com1 (XEN) Video information: (XEN) VGA is text mode 80x25, font 8x16 (XEN) Disc information: (XEN) Found 1 MBR signatures (XEN) Found 1 EDD information structures (XEN) Xen-e820 RAM map: (XEN) - 0009f400 (usable) (XEN) 0009f400 - 000a (reserved) (XEN) 000ca000 - 000cc000 (reserved) (XEN) 000dc000 - 0010 (reserved) (XEN) 0010 - bfee (usable) (XEN) bfee - bfeff000 (ACPI data) (XEN) bfeff000 - bff0 (ACPI NVS) (XEN) bff0 - c000 (usable) (XEN) e000 - f000 (reserved) (XEN) fec0 - fec1 (reserved) (XEN) fee0 - fee01000 (reserved) (XEN) fffe - 0001 (reserved) (XEN) 0001 - 00024000 (usable) (XEN) ACPI: RSDP 000F6B50, 0024 (r2 PTLTD ) (XEN) ACPI: XSDT BFEE2F86, 005C (r1 INTEL 440BX 604 VMW 1324272) (XEN) ACPI: FACP BFEFEE98, 00F4 (r4 INTEL 440BX 604 PTL F4240) (XEN) ACPI: DSDT BFEE35B8, 1B8E0 (r1 PTLTD Custom604 MSFT 301) (XEN) ACPI: FACS BFEFFFC0, 0040 (XEN) ACPI: BOOT BFEE3590, 0028 (r1 PTLTD $SBFTBL$ 604 LTP1) (XEN) ACPI: APIC BFEE338E, 0202 (r1 PTLTDAPIC604 LTP0) (XEN) ACPI: MCFG BFEE3352, 003C (r1 PTLTD $PCITBL$ 604 LTP1) (XEN) ACPI: SRAT BFEE3082, 02D0 (r2 VMWARE MEMPLUG 604 VMW 1) (XEN) ACPI: HPET BFEE304A, 0038 (r1 VMWARE VMW HPET 604 VMW 1) (XEN) ACPI: WAET BFEE3022, 0028 (r1 VMWARE VMW WAET 604 VMW 1) (XEN) System RAM: 8191MB (8388092kB) (XEN) SRAT: PXM 0 -> APIC 00 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 01 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 02 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 03 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 04 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 05 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 06 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 07 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 08 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 09 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 0a -> Node 0 (XEN) SRAT: PXM 0 -> APIC 0b -> Node 0 (XEN) SRAT: PXM 0 -> APIC 0c -> Node 0 (XEN) SRAT: PXM 0 -> APIC 0d -> Node 0 (XEN) SRAT: PXM 0 
-> APIC 0e -> Node 0 (XEN) SRAT: PXM 0 -> APIC 0f -> Node 0 (XEN) SRAT: PXM 0 -> APIC 10 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 11 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 12 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 13 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 14 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 15 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 16 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 17 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 18 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 19 -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1a -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1b -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1c -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1d -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1e -> Node 0 (XEN) SRAT: PXM 0 -> APIC 1f -> Node 0 (XEN) SRAT: Node 0 PXM 0 0-a (XEN) SRAT: Node 0 PXM 0 10-c000 (XEN) SRAT: Node 0 PXM 0 1-24000 (XEN) SRAT: Node 0 PXM 0 24000-204000 (hotplug) (XEN) NUMA: Allocated memnodemap from 23b531000 - 23b552000 (XEN) NUMA: Using 8 for the hash shift. (XEN) Domain heap initialised (XEN) found SMP MP-table at 000f6bc0 (XE
[Xen-devel] [PATCH v4] xen: sched: convert RTDS from time to event driven model
v4 is meant for discussion on the addition of replq. Changes since v3: removed running queue. added repl queue to keep track of repl events. timer is now per scheduler. timer is init on a valid cpu in a cpupool. Bugs to be fixed: Cpupool and locks. When a pcpu is removed from a pool and added to another, the lock equality assert in free_pdata() fails when Pool-0 is using rtds. This patch is based on master branch after commit 2e46e3 x86/mce: fix misleading indentation in init_nonfatal_mce_checker() Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 262 - 1 file changed, 192 insertions(+), 70 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 2e5430f..c36e5de 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -142,6 +143,9 @@ static cpumask_var_t *_cpumask_scratch; */ static unsigned int nr_rt_ops; +/* handler for the replenishment timer */ +static void repl_handler(void *data); + /* * Systme-wide private data, include global RunQueue/DepletedQ * Global lock is referenced by schedule_data.schedule_lock from all @@ -152,7 +156,9 @@ struct rt_private { struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head replq; /* ordered list of vcpus that need repl */ cpumask_t tickled; /* cpus been tickled */ +struct timer *repl_timer; /* replenishment timer */ }; /* @@ -160,6 +166,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head p_elem;/* on the repl event list 
*/ /* Up-pointers */ struct rt_dom *sdom; @@ -213,8 +220,14 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head *rt_replq(const struct scheduler *ops) +{ +return _priv(ops)->replq; +} + /* - * Queue helper functions for runq and depletedq + * Queue helper functions for runq, depletedq + * and repl event q */ static int __vcpu_on_q(const struct rt_vcpu *svc) @@ -228,6 +241,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static struct rt_vcpu * +__p_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, p_elem); +} + +static int +__vcpu_on_p(const struct rt_vcpu *svc) +{ + return !list_empty(>p_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -387,6 +412,13 @@ __q_remove(struct rt_vcpu *svc) list_del_init(>q_elem); } +static inline void +__p_remove(struct rt_vcpu *svc) +{ +if ( __vcpu_on_p(svc) ) +list_del_init(>p_elem); +} + /* * Insert svc with budget in RunQ according to EDF: * vcpus with smaller deadlines go first. @@ -421,6 +453,32 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) } /* + * Insert svc into the repl even list: + * vcpus that needs to be repl earlier go first. 
+ */ +static void +__replq_insert(const struct scheduler *ops, struct rt_vcpu *svc) +{ +struct rt_private *prv = rt_priv(ops); +struct list_head *replq = rt_replq(ops); +struct list_head *iter; + +ASSERT( spin_is_locked(>lock) ); + +ASSERT( !__vcpu_on_p(svc) ); + +list_for_each(iter, replq) +{ +struct rt_vcpu * iter_svc = __p_elem(iter); +if ( svc->cur_deadline <= iter_svc->cur_deadline ) +break; +} + +list_add_tail(>p_elem, iter); +} + + +/* * Init/Free related code */ static int @@ -449,6 +507,7 @@ rt_init(struct scheduler *ops) INIT_LIST_HEAD(>sdom); INIT_LIST_HEAD(>runq); INIT_LIST_HEAD(>depletedq); +INIT_LIST_HEAD(>replq); cpumask_clear(>tickled); @@ -473,6 +532,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + +kill_timer(prv->repl_timer); + xfree(prv); } @@ -586,6 +648,7 @@ rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd) return NULL; INIT_LIST_HEAD(>q_elem); +INIT_LIST_HEAD(>p_elem); svc->flags = 0U; svc->sdom = dd; svc->vcpu = vc; @@ -618,6 +681,10 @@ static void rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu *svc = rt_vc
Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
On 1/26/2016 12:28 PM, Meng Xu wrote: Hi Dario and Tianyang, On Tue, Jan 26, 2016 at 9:06 AM, Dario Faggioli <dario.faggi...@citrix.com> wrote: On Mon, 2016-01-25 at 18:00 -0500, Meng Xu wrote: On Mon, Jan 25, 2016 at 5:04 PM, Tianyang Chen <ti...@seas.upenn.edu> wrote: I have removed some of the Ccs so they won't get bothered as we discussed previously. On 1/25/2016 4:00 AM, Dario Faggioli wrote: On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote: @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + So, there's always only one timer... Even if we have multiple cpupool with RTDS as their scheduler, they share the replenishment timer? I think it makes more sense to make this per-scheduler. Yeah, I totally ignored the case for cpu-pools. It looks like when a cpu-pool is created, it copies the scheduler struct and calls rt_init() where a private field is initialized. So I assume the timer should be put inside the scheduler private struct? Now that I think about it, the timer is hard-coded to run on cpu0. If there're lots of cpu-pools but the replenishment can only be done on the same pcpu, would that be a problem? Should we keep track of all instances of schedulers (nr_rt_ops counts how many) and just put times on different pcpus? +/* controls when to first start the timer*/ +static int timer_started; + I don't like this, and I don't think we need it. In fact, you removed it yourself from v3, AFAICT. 
@@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(>sdom_elem, >sdom->vcpu); + +if(!timer_started) +{ +/* the first vcpu starts the timer for the first time*/ +timer_started = 1; +set_timer(_timer,svc->cur_deadline); +} } This also seems to be gone in v3, which is good. In fact, it uses timer_started, which I already said I didn't like. About the actual startup of the timer (no matter whether for first time or not). Here, you were doing it in _vcpu_insert() and not in _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in _runq_insert()... Which one is the proper way? Correct me if I'm wrong, at the beginning of the boot process, all vcpus are put to sleep/not_runnable after insertions. Therefore, the timer should start when the first vcpu wakes up. I think the wake() in v3 should be correct. @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ Please, allow me to say that seeing this function going away, fills my heart with pure joy!! :-D @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; This is ok as it is done in v3 (i.e., snext->budget if !idle, -1 if idle). 
@@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) /* insert svc to runq/depletedq because svc is not in queue now */ __runq_insert(ops, svc); -__repl_update(ops, now); - -ASSERT(!list_empty(>sdom)); -sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -online = cpupool_scheduler_cpumask(sdom->dom->cpupool); -snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */ - -runq_tickle(ops, snext); +runq_tickle(ops, svc); And this is another thing I especially like of this patch: it makes the wakeup path a lot simpler and a lot more similar to how it looks like in the other schedulers. Good job with this. :-) @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) if ( test_and_clear_bit(__RTDS_delayed_runq_add, flags) && likely(vcpu_runnable(vc)) ) { +/* only insert the pre-empted vcpu back*/ __runq_insert(ops, svc); -__repl_update(ops, NOW()); - -ASSERT(!list_empty(>sdom)); -sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -online = cpupool_scheduler_cpumask(sdom->dom- cpupool); -snext = __runq_pick(ops, online); /* pick snext from ALL cpus */ - -runq_tickle(ops, snext); } Mmm... I'll think about this more and let you know... But out of the top of my head, I think the tickling has to stay? You preempted a vcpu from th
Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
I have removed some of the Ccs so they won't get bothered as we discussed previously. On 1/25/2016 4:00 AM, Dario Faggioli wrote: On Thu, 2015-12-31 at 05:20 -0500, Tianyang Chen wrote: @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + So, there's always only one timer... Even if we have multiple cpupool with RTDS as their scheduler, they share the replenishment timer? I think it makes more sense to make this per-scheduler. Yeah, I totally ignored the case for cpu-pools. It looks like when a cpu-pool is created, it copies the scheduler struct and calls rt_init() where a private field is initialized. So I assume the timer should be put inside the scheduler private struct? Now that I think about it, the timer is hard-coded to run on cpu0. If there're lots of cpu-pools but the replenishment can only be done on the same pcpu, would that be a problem? Should we keep track of all instances of schedulers (nr_rt_ops counts how many) and just put times on different pcpus? +/* controls when to first start the timer*/ +static int timer_started; + I don't like this, and I don't think we need it. In fact, you removed it yourself from v3, AFAICT. @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(>sdom_elem, >sdom->vcpu); + +if(!timer_started) +{ +/* the first vcpu starts the timer for the first time*/ +timer_started = 1; +set_timer(_timer,svc->cur_deadline); +} } This also seems to be gone in v3, which is good. In fact, it uses timer_started, which I already said I didn't like. About the actual startup of the timer (no matter whether for first time or not). 
Here, you were doing it in _vcpu_insert() and not in _vcpu_wake(); in v3 you're doing it in _vcpu_wake() and not in _runq_insert()... Which one is the proper way? Correct me if I'm wrong, at the beginning of the boot process, all vcpus are put to sleep/not_runnable after insertions. Therefore, the timer should start when the first vcpu wakes up. I think the wake() in v3 should be correct. @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ Please, allow me to say that seeing this function going away, fills my heart with pure joy!! :-D @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; This is ok as it is done in v3 (i.e., snext->budget if !idle, -1 if idle). @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) /* insert svc to runq/depletedq because svc is not in queue now */ __runq_insert(ops, svc); -__repl_update(ops, now); - -ASSERT(!list_empty(>sdom)); -sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -online = cpupool_scheduler_cpumask(sdom->dom->cpupool); -snext = __runq_pick(ops, online); /* pick snext from ALL valid cpus */ - -runq_tickle(ops, snext); +runq_tickle(ops, svc); And this is another thing I especially like of this patch: it makes the wakeup path a lot simpler and a lot more similar to how it looks like in the other schedulers. Good job with this. 
:-) @@ -1108,15 +1078,8 @@ rt_context_saved(const struct scheduler *ops, struct vcpu *vc) if ( test_and_clear_bit(__RTDS_delayed_runq_add, >flags) && likely(vcpu_runnable(vc)) ) { +/* only insert the pre-empted vcpu back*/ __runq_insert(ops, svc); -__repl_update(ops, NOW()); - -ASSERT(!list_empty(>sdom)); -sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -online = cpupool_scheduler_cpumask(sdom->dom->cpupool); -snext = __runq_pick(ops, online); /* pick snext from ALL cpus */ - -runq_tickle(ops, snext); } Mmm... I'll think about this more and let you know... But out of the top of my head, I think the tickling has to stay? You preempted a vcpu from the pcpu where it was running, maybe some other pcpu is either idle or running a vcpu with a later deadline, and should come and pick this one up? gEDF allows this but there is overhead and may not be worth it. I have no stats to support this but there are some papers on restricting what tasks c
[Xen-devel] [PATCH v3 0/1] xen: sched: convert RTDS from time to event driven model
Current RTDS scheduler is time driven and is called every 1ms. During each scheduler call, the repl_update() scans both runq and depletedq, which might not be necessary every 1ms. Since each vcpu is implemented as a deferrable server, budget is preserved during its period and refilled in the next. It is not necessary to check every 1ms as the current design does. The replenishment is needed at the nearest next period (nearest current_deadline) of all runnable vcpus. This improved design tries to reduce scheduler invocation by using an event driven approach; rt_schedule() will return a value when the scheduler needs to be called next time. In addition, the sched_rt will have one dedicated timer to handle replenishment when necessary. In other words, the budget replenishment and scheduler decision (rt_schedule) are separated. The transient behavior should be noted. It happens between when a vcpu is tickled and when a pcpu actually picks it up. As per previous discussions, this is unavoidable. Previous discussions: http://lists.xenproject.org/archives/html/xen-devel/2015-06/msg02629.html Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> Tianyang Chen (1): xen: sched: convert RTDS from time to event driven model xen/common/sched_rt.c | 291 - 1 file changed, 213 insertions(+), 78 deletions(-) -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH v3 1/1] xen: sched: convert RTDS from time to event driven model
Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A new runningq has been added to keep track of all vcpus that are on pcpus. The following functions have major changes to manage the runningq and replenishment: repl_handler(): It is a timer handler which is re-programmed to fire at the nearest vcpu deadline to replenish vcpus on runq, depletedq and runningq. When the replenishment is done, each replenished vcpu should tickle a pcpu to see if it needs to preempt any running vcpus. rt_schedule(): picks the highest runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). If an idle vcpu is picked, -1 is returned to avoid busy-waiting. repl_update() has been removed. rt_vcpu_wake(): when a vcpu is awake, it tickles instead of picking one from runq. When a vcpu wakes up, it might reprogram the timer if it has a more recent release time. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Runningq is updated and picking from runq and tickling are removed. 
Simplified funtional graph: schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_vcpu_on_q(i)> { replenish(i) runq_tickle(i) }> program_timer() [spin_lock] Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 291 - 1 file changed, 213 insertions(+), 78 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..1470e94 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -87,7 +88,7 @@ #define RTDS_DEFAULT_BUDGET (MICROSECS(4000)) #define UPDATE_LIMIT_SHIFT 10 -#define MAX_SCHEDULE(MILLISECS(1)) + /* * Flags */ @@ -147,11 +148,19 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. 
It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + +/* handler for the replenishment timer */ +static void repl_handler(void *data); + struct rt_private { spinlock_t lock;/* the global coarse grand lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ struct list_head runq; /* ordered list of runnable vcpus */ struct list_head depletedq; /* unordered list of depleted vcpus */ +struct list_head runningq; /* current running vcpus */ cpumask_t tickled; /* cpus been tickled */ }; @@ -160,6 +169,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head runningq_elem; /* on the runningq list */ struct list_head sdom_elem; /* on the domain VCPU list */ /* Up-pointers */ @@ -215,6 +225,11 @@ static inline struct list_head *rt_depletedq(const struct scheduler *ops) return _priv(ops)->depletedq; } +static inline struct list_head *rt_runningq(const struct scheduler *ops) +{ +return _priv(ops)->runningq; +} + /* * Queue helper functions for runq and depletedq */ @@ -230,6 +245,18 @@ __q_elem(struct list_head *elem) return list_entry(elem, struct rt_vcpu, q_elem); } +static int +__vcpu_on_runningq(const struct rt_vcpu *svc) +{ +return !list_empty(>runningq_elem); +} + +static struct rt_vcpu * +__runningq_elem(struct list_head *elem) +{ +return list_entry(elem, struct rt_vcpu, runningq_elem); +} + /* * Debug related code, dump vcpu/cpu information */ @@ -290,7 +317,7 @@ rt_dump_pcpu(const struct scheduler *ops, int cpu) static void rt_dump(const struct scheduler *ops) { -struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *iter; +struct list_head *iter_sdom, *iter_svc, *runq, *depletedq, *runningq, *iter; struct rt_private *prv = rt_priv(ops); struct rt_vcpu *svc; struct rt_dom *sdom; @@ -303,6 +330,7 @@ rt_dump(const struct scheduler *ops) runq = rt_runq(ops); depletedq = rt_depletedq(ops); +runningq = 
rt_runningq(ops); printk("Global RunQueue info:\n"); list_for_each( iter, runq ) @@ -318,6 +346,13 @@ rt_dump(const struct scheduler *ops) rt_dump_vcpu(ops, svc); } +printk("Global RunningQueue info:\n"); +list_for_each( iter, runningq ) +{ +svc = __runningq_elem(iter); +rt_dump_vcpu(ops, svc); +} + printk
Re: [Xen-devel] [PATCH v3 1/1] xen: sched: convert RTDS from time to event driven model
On 1/21/2016 11:06 PM, Tianyang Chen wrote: Budget replenishment and enforcement are separated by adding a replenishment timer, which fires at the next most imminent release time of all runnable vcpus. A new runningq has been added to keep track of all vcpus that are on pcpus. The following functions have major changes to manage the runningq and replenishment Well, I'm just gonna comment on couple things here. The major difference between this patch and the previous one is the addition of a runningq, which leads to extra code in context_save(). Since the scheduler doesn't run until the first vcpu wakes up, wake() does some of the timer stuff, in additional to the logic in the handler. It works nicely especially when a vcpu wakes up and has the earliest next release time. @@ -160,6 +169,7 @@ struct rt_private { */ struct rt_vcpu { struct list_head q_elem;/* on the runq/depletedq list */ +struct list_head runningq_elem; /* on the runningq list */ struct list_head sdom_elem; /* on the domain VCPU list */ Is it better to re-use q-elem so extra helper functions can be avoided? Maybe another flag thing that shows which q a vcpu is on? @@ -848,8 +880,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); -__repl_update(ops, now); - if ( tasklet_work_scheduled ) { snext = rt_vcpu(idle_vcpu[cpu]); @@ -875,6 +905,9 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched set_bit(__RTDS_delayed_runq_add, >flags); snext->last_start = now; + +ret.time = -1; /* if an idle vcpu is picked */ + I kinda just stick this in based on previous discussion on RTDS busy-waiting. +/* The replenishment timer handler scans + * all three queues and find the most + * imminent release time. 
If there is no + * vcpus, timer is set in wake() + */ +static void repl_handler(void *data){ +unsigned long flags; +s_time_t now = NOW(); +s_time_t min_repl = LONG_MAX; /* max time used in comparisoni */ +struct scheduler *ops = data; +struct rt_private *prv = rt_priv(ops); +struct list_head *runq = rt_runq(ops); +struct list_head *depletedq = rt_depletedq(ops); +struct list_head *runningq = rt_runningq(ops); +struct list_head *iter; +struct list_head *tmp; +struct rt_vcpu *svc = NULL; + +stop_timer(_timer); + +spin_lock_irqsave(>lock, flags); + Here is a lock for protecting queues. Correct me if I'm wrong, since in alloc_pdata(), the private lock points to the global system lock, I just assume there is no difference here. -- Tianyang Chen ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] Questions about the use of idle_vcpu[]
On 1/18/2016 11:07 AM, Meng Xu wrote: On Mon, Jan 18, 2016 at 6:00 AM, Dario Faggioli <dario.faggi...@citrix.com> wrote: On Mon, 2016-01-18 at 10:47 +, George Dunlap wrote: On Fri, Jan 15, 2016 at 1:04 AM, Tianyang Chen <ti...@seas.upenn.edu> If an idle vcpu is picked, the ret.time is set accordingly in both credit and credit2 by checking whether snext is idle. if so, credit returns -1 and credit2 returns 2ms. However, there is no corresponding code in the RTDS scheduler to handle this. When an idle_vcpu is picked, the value of ret.time would be 0 and the scheduler would be invoked again. What is the logic behind this? No real logic, as far as I can tell. :-) The ret.time return value tells the generic scheduling code when to set the next scheduler timer. According to the comment in xen/common/schedule.c:schedule(), returning a negative value means "don't bother setting a timer" (e.g., no time limit). So credit1 does the right thing. It does. Then the RTDS is doing *incorrectly* right now. :-( George: Thanks. After looking at idle_loop() it makes sense now. Even though an idle vcpu won't tell scheduler timer when to fire next time, do_tasklet() checks if all tasklets on the list are finished and then raise SCHEDULE_SOFTIRQ. It looks like credit2's behavior will probably prevent the processor from going into deeper power-saving states, and rtds' behavior might cause it to essentially busy-wait. RTDS behavior is broken in many respect, including this, and in fact, Meng and Tianyang are sending patches already to fix it (I'll let you guys have my comments shortly :-P). Right. Tianyang and I are working on changing it from quantum driven model to event-driven (or called timer-driven) model. Tianyang sent out the first-version patch, but that version has some problems. He is working on the second version now. Hi Dario, Tianyang is working on the second version right now. 
If you could have a quick look at our discussion in that thread and points out the "serious" issues in the decision, that will be great! We won't repeat the error again and again in the following versions. As to the minor issues, we could refine it in the second version. (I'm just thinking about how to save your time to have this done. For the obvious things that I can handle, I will do it and avoid "wasting" you time. For the design choices that we are unclear, we definitely need your insights/commands. ;-) ) Dario: I had some discussion with Meng recently and the second version will soon come out. You can directly comment on it if that saves you some time. Thanks, Tianyang ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
On 1/15/2016 10:33 AM, Meng Xu wrote: On Wed, Jan 6, 2016 at 2:57 AM, Tianyang Chen <ti...@seas.upenn.edu> wrote: On 12/31/2015 8:44 AM, Meng Xu wrote: On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <ti...@seas.upenn.edu> wrote: Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Changes since V1: None Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 159 + 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..d522272 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + +/* controls when to first start the timer*/ missing a space between * and / at the end of this line. +static int timer_started; + +/* handler for the replenishment timer */ +static void repl_handler(void *data); + struct rt_private { spinlock_t lock;/* the global coarse grand lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) static int rt_init(struct scheduler *ops) { +const int cpu = smp_processor_id(); Did the "cpu" be used? If not, please remove it here. 
struct rt_private *prv = xzalloc(struct rt_private); printk("Initializing RTDS scheduler\n" @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) ops->sched_data = prv; +init_timer(_timer, repl_handler, ops, 0); + return 0; no_mem: @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + +kill_timer(_timer); + xfree(prv); } @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(>sdom_elem, >sdom->vcpu); + +if(!timer_started) space should be added.. Thanks, Meng. I will fix that. +{ +/* the first vcpu starts the timer for the first time*/ +timer_started = 1; +set_timer(_timer,svc->cur_deadline); +} } /* @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ -struct list_head *runq = rt_runq(ops); -struct list_head *depletedq = rt_depletedq(ops); -struct list_head *iter; -struct list_head *tmp; -struct rt_vcpu *svc = NULL; - -list_for_each_safe(iter, tmp, runq) -{ -svc = __q_elem(iter); -if ( now < svc->cur_deadline ) -break; - -rt_update_deadline(now, svc); -/* reinsert the vcpu if its deadline is updated */ -__q_remove(svc); -__runq_insert(ops, svc); -} - -list_for_each_safe(iter, tmp, depletedq) -{ -svc = __q_elem(iter); -if ( now >= svc->cur_deadline ) -{ -rt_update_deadline(now, svc); -__q_remove(svc); /* remove from depleted queue */ -__runq_insert(ops, svc); /* add to runq */ -} -} -} - -/* * schedule function for rt scheduler. 
* The lock is already grabbed in schedule.c, no need to lock here */ @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); -__repl_update(ops, now); if ( tasklet_work_scheduled ) { @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; /* TRACE */ @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); s_time_t now = NOW(); -struct rt_private *prv = rt_priv(ops); -struct rt_vcpu *snext = NULL; /* highest priority on RunQ */ -
[Xen-devel] Questions about the use of idle_vcpu[]
Hello, I'm confused by the use of idle_vcpu[] in Xen schedulers. In the credit, credit2 and rtds schedulers, they are used in conjunction with the variable "tasklet_work_scheduled" like this: if ( tasklet_work_scheduled ) { snext = #_VCPU(idle_vcpu[cpu]); } The idle_vcpu array is initialized in cpu_schedule_up() and indeed they are set in the corresponding alloc_vdata() to have IDLE_CREDIT and 0 weight or DEFAULT_PERIOD and 0 budget. The value of tasklet_work_scheduled comes from the scheduler.c interface. According to the comments in sched_credit2.c, if there's tasklet work to do, we want to choose the idle vcpu for this processor, and mark the current one for delayed runqueue add. Can someone elaborate on the reason why an idle vcpu should be picked? What does an idle_vcpu do to a tasklet? I feel like I'm missing something here. If an idle vcpu is picked, ret.time is set accordingly in both credit and credit2 by checking whether snext is idle. If so, credit returns -1 and credit2 returns 2ms. However, there is no corresponding code in the RTDS scheduler to handle this. When an idle_vcpu is picked, the value of ret.time would be 0 and the scheduler would be invoked again. What is the logic behind this? I'd appreciate it if anyone could point me in the right direction. Thanks, Tianyang Chen ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
Re: [Xen-devel] [PATCH 1/1] Improved RTDS scheduler
On 1/4/2016 9:36 AM, Dario Faggioli wrote: On Mon, 2016-01-04 at 21:48 +0800, Meng Xu wrote: On Mon, Jan 4, 2016 at 7:42 PM, Ian Campbell <ian.campb...@citrix.com wrote: On Thu, 2015-12-31 at 04:45 -0500, Tianyang Chen wrote: Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- -cover-letter.patch| 16 +++ 0001-Improved-RTDS-scheduler.patch | 280 bak| 62 I assume you didn't actually intend to commit those ;-) No. This version was sent by mistake. Please ignore this version. :-( Tianyang sent another version (v2) with the subject "[PATCH V2 1/1] Improved RTDS scheduler". Indeed. BTW, I'm back today from Winter Holidays. I'll give a look at this series as soon as practical. I'll comment directly on V2, with the only exception of this very email. So, first of all, Cc list (which is the same here and in V2): how have the names of the people that are in there been picked? Dario: I picked the names based on previous discussion threads between you, Meng and Dagaen. Now I have changed who will be cc'ed so they won't be bothered. The patch is only about xen/common/sched_rt.c. As per MAINTAINERS, the maintainer of the code hosted in that file is just me. Surely it's a good thing to also include George, as he's the other maintainer of scheduling in general. All the other people, though, should not be bothered by being copied directly... Especially IanC and IanJ, I'd say. (In general single patches do not require a separate cover letter unless the required context contains a large amount of information which is not appropriate for the commit message of the actual change, I'll leave it to you and the scheduler maintainers to decide how much of your cover letter it would be appropriate to move to the commit message). 
The cover letter is supposed to explain the design idea of the improved RTDS scheduler so that reviewers could (potentially) save time in reviewing the patch, we think. :-) Probably, the commit message should be refined and self-contained? That's true. In particular, the "context", defined as summary of what happened and have been discussed on the mailing list before, during previous submissions, etc., definitely belongs to a cover letter. There are pages of discussions on the current design and we figured we should make a cover page to summarize it and put more details in. High level design ideas and concepts, also do, e.g., in order to avoid ending up with commit changelogs that are all several pages long! :-) All that being said, we certainly want patches' subject lines and changelogs to be descriptive of at least what is being done, and why. If we commit this as it is right now, here it is what we'll find in `git log': Improved RTDS scheduler Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Which won't, I'm quite sure about it, be that much useful when looking at it, say, in 3 years! :-D I know, finding the proper balance of what's better put where is not always super easy, especially at the beginning... but that's part of the process of cooking a good patch (series) :-) A few suggestions: - we want tags in the subject. E.g., in this case, something like "xen: sched: Improved RTDS scheduler" - avoid (in both subject and changelog) generic terms like "improve" "fix", etc., unless you can explain more specifically to what they refer... I mean, pretty much all commits that touch sched_rt.c can have "Improve RTDS" as a subject, can't they? (Unless we decide to commit something making things deliberately worse!! :-)) Good call. In this case, e.g. 
(although, perhaps a bit long): "xen: sched: convert RTDS from time to event driven model" - as Ian said, there are information in the cover letter that can well be moved in the changelog, making it a lot more useful, and not less refined or self contained. Here is what I propose, please let me know your thoughts: Cover letter: 1. How this patch could improve RTDS 2. Summary of previous discussion Current RTDS scheduler is time driven and is called every 1ms. During each scheduler call, the repl_update() scans both runq and depeletedq, which might not be necessary every 1ms. Since each vcpu is implemented as a deferable server, budget is preserved during its period and refilled in the next. It is not necessary to check every 1ms as the current design does. The replenishment is needed at the nearest next period(nearest current_deadline)
Re: [Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
On 12/31/2015 8:44 AM, Meng Xu wrote: On Thu, Dec 31, 2015 at 6:20 PM, Tianyang Chen <ti...@seas.upenn.edu> wrote: Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Changes since V1: None Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 159 + 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..d522272 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + +/* controls when to first start the timer*/ missing a space between * and / at the end of this line. +static int timer_started; + +/* handler for the replenishment timer */ +static void repl_handler(void *data); + struct rt_private { spinlock_t lock;/* the global coarse grand lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) static int rt_init(struct scheduler *ops) { +const int cpu = smp_processor_id(); Did the "cpu" be used? If not, please remove it here. 
struct rt_private *prv = xzalloc(struct rt_private); printk("Initializing RTDS scheduler\n" @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) ops->sched_data = prv; +init_timer(_timer, repl_handler, ops, 0); + return 0; no_mem: @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + +kill_timer(_timer); + xfree(prv); } @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(>sdom_elem, >sdom->vcpu); + +if(!timer_started) space should be added.. Thanks, Meng. I will fix that. +{ +/* the first vcpu starts the timer for the first time*/ +timer_started = 1; +set_timer(_timer,svc->cur_deadline); +} } /* @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ -struct list_head *runq = rt_runq(ops); -struct list_head *depletedq = rt_depletedq(ops); -struct list_head *iter; -struct list_head *tmp; -struct rt_vcpu *svc = NULL; - -list_for_each_safe(iter, tmp, runq) -{ -svc = __q_elem(iter); -if ( now < svc->cur_deadline ) -break; - -rt_update_deadline(now, svc); -/* reinsert the vcpu if its deadline is updated */ -__q_remove(svc); -__runq_insert(ops, svc); -} - -list_for_each_safe(iter, tmp, depletedq) -{ -svc = __q_elem(iter); -if ( now >= svc->cur_deadline ) -{ -rt_update_deadline(now, svc); -__q_remove(svc); /* remove from depleted queue */ -__runq_insert(ops, svc); /* add to runq */ -} -} -} - -/* * schedule function for rt scheduler. 
* The lock is already grabbed in schedule.c, no need to lock here */ @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); -__repl_update(ops, now); if ( tasklet_work_scheduled ) { @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; /* TRACE */ @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); s_time_t now = NOW(); -struct rt_private *prv = rt_priv(ops); -struct rt_vcpu *snext = NULL; /* highest priority on RunQ */ -struct rt_dom *sdom = NULL; -cpumask_t *online; BUG_ON( is_idle_vcpu(vc) ); @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) /* ins
[Xen-devel] [PATCH 1/1] Improved RTDS scheduler
Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- -cover-letter.patch| 16 +++ 0001-Improved-RTDS-scheduler.patch | 280 bak| 62 xen/common/sched_rt.c | 159 +++- 4 files changed, 453 insertions(+), 64 deletions(-) create mode 100644 -cover-letter.patch create mode 100644 0001-Improved-RTDS-scheduler.patch create mode 100644 bak diff --git a/-cover-letter.patch b/-cover-letter.patch new file mode 100644 index 000..f5aca91 --- /dev/null +++ b/-cover-letter.patch @@ -0,0 +1,16 @@ +From 25ca27ea281885eb9873244a11f08e6987efb36e Mon Sep 17 00:00:00 2001 +From: Tianyang Chen <ti...@seas.upenn.edu> +Date: Thu, 31 Dec 2015 04:05:21 -0500 +Subject: [PATCH] *** SUBJECT HERE *** + +*** BLURB HERE *** + +Tianyang Chen (1): + Improved RTDS scheduler + + xen/common/sched_rt.c | 159 + + 1 file changed, 95 insertions(+), 64 deletions(-) + +-- +1.7.9.5 + diff --git a/0001-Improved-RTDS-scheduler.patch b/0001-Improved-RTDS-scheduler.patch new file mode 100644 index 000..02cb1f7 --- /dev/null +++ b/0001-Improved-RTDS-scheduler.patch @@ -0,0 +1,280 @@ +From 25ca27ea281885eb9873244a11f08e6987efb36e Mon Sep 17 00:00:00 2001 +From: Tianyang Chen <tianyangp...@gmail.com> +Date: Thu, 31 Dec 2015 01:55:19 -0500 +Subject: [PATCH] Improved RTDS scheduler + +Budget replenishment is now handled by a dedicated timer which is +triggered at the most imminent release time of all runnable vcpus. 
+ +Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> +Signed-off-by: Meng Xu <men...@cis.upenn.edu> +Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> +--- + xen/common/sched_rt.c | 159 + + 1 file changed, 95 insertions(+), 64 deletions(-) + +diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c +index 4372486..d522272 100644 +--- a/xen/common/sched_rt.c b/xen/common/sched_rt.c +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; + * Global lock is referenced by schedule_data.schedule_lock from all + * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() + */ ++ ++/* dedicated timer for replenishment */ ++static struct timer repl_timer; ++ ++/* controls when to first start the timer*/ ++static int timer_started; ++ ++/* handler for the replenishment timer */ ++static void repl_handler(void *data); ++ + struct rt_private { + spinlock_t lock;/* the global coarse grand lock */ + struct list_head sdom; /* list of availalbe domains, used for dump */ +@@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) + static int + rt_init(struct scheduler *ops) + { ++const int cpu = smp_processor_id(); + struct rt_private *prv = xzalloc(struct rt_private); + + printk("Initializing RTDS scheduler\n" +@@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) + + ops->sched_data = prv; + ++init_timer(_timer, repl_handler, ops, 0); ++ + return 0; + + no_mem: +@@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) + xfree(_cpumask_scratch); + _cpumask_scratch = NULL; + } ++ ++kill_timer(_timer); ++ + xfree(prv); + } + +@@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) + + /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ + list_add_tail(>sdom_elem, >sdom->vcpu); ++ ++if(!timer_started) ++{ ++/* the first vcpu starts the timer for the first time*/ ++timer_started = 1; 
++set_timer(_timer,svc->cur_deadline); ++} + } + + /* +@@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) + } + + /* +- * Update vcpu's budget and +- * sort runq by insert the modifed vcpu back to runq +- * lock is grabbed before calling this function +- */ +-static void +-__repl_update(const struct scheduler *ops, s_time_t now) +-{ +-struct list_head *runq = rt_runq(ops); +-struct list_head *depletedq = rt_depletedq(ops); +-struct list_head *iter; +-struct list_head *tmp; +-struct rt_vcpu *svc = NULL; +- +-list_for_each_safe(iter, tmp, runq) +-{ +-svc = __q_elem(iter); +-if ( now < svc->cur_deadline ) +-break; +- +-rt_update_deadline(now, svc); +-/* reinsert the vcpu if its deadline is updated */ +-__q_remove(svc); +-__runq_insert(ops, svc); +-} +- +-list_for_each_safe(i
[Xen-devel] [PATCH V2 1/1] Improved RTDS scheduler
Budget replenishment is now handled by a dedicated timer which is triggered at the most imminent release time of all runnable vcpus. Changes since V1: None Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> --- xen/common/sched_rt.c | 159 + 1 file changed, 95 insertions(+), 64 deletions(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..d522272 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -147,6 +148,16 @@ static unsigned int nr_rt_ops; * Global lock is referenced by schedule_data.schedule_lock from all * physical cpus. It can be grabbed via vcpu_schedule_lock_irq() */ + +/* dedicated timer for replenishment */ +static struct timer repl_timer; + +/* controls when to first start the timer*/ +static int timer_started; + +/* handler for the replenishment timer */ +static void repl_handler(void *data); + struct rt_private { spinlock_t lock;/* the global coarse grand lock */ struct list_head sdom; /* list of availalbe domains, used for dump */ @@ -426,6 +437,7 @@ __runq_insert(const struct scheduler *ops, struct rt_vcpu *svc) static int rt_init(struct scheduler *ops) { +const int cpu = smp_processor_id(); struct rt_private *prv = xzalloc(struct rt_private); printk("Initializing RTDS scheduler\n" @@ -454,6 +466,8 @@ rt_init(struct scheduler *ops) ops->sched_data = prv; +init_timer(_timer, repl_handler, ops, 0); + return 0; no_mem: @@ -473,6 +487,9 @@ rt_deinit(const struct scheduler *ops) xfree(_cpumask_scratch); _cpumask_scratch = NULL; } + +kill_timer(_timer); + xfree(prv); } @@ -635,6 +652,13 @@ rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc) /* add rt_vcpu svc to scheduler-specific vcpu list of the dom */ list_add_tail(>sdom_elem, >sdom->vcpu); + +if(!timer_started) +{ +/* the first vcpu starts the timer for the first time*/ 
+timer_started = 1; +set_timer(_timer,svc->cur_deadline); +} } /* @@ -792,44 +816,6 @@ __runq_pick(const struct scheduler *ops, const cpumask_t *mask) } /* - * Update vcpu's budget and - * sort runq by insert the modifed vcpu back to runq - * lock is grabbed before calling this function - */ -static void -__repl_update(const struct scheduler *ops, s_time_t now) -{ -struct list_head *runq = rt_runq(ops); -struct list_head *depletedq = rt_depletedq(ops); -struct list_head *iter; -struct list_head *tmp; -struct rt_vcpu *svc = NULL; - -list_for_each_safe(iter, tmp, runq) -{ -svc = __q_elem(iter); -if ( now < svc->cur_deadline ) -break; - -rt_update_deadline(now, svc); -/* reinsert the vcpu if its deadline is updated */ -__q_remove(svc); -__runq_insert(ops, svc); -} - -list_for_each_safe(iter, tmp, depletedq) -{ -svc = __q_elem(iter); -if ( now >= svc->cur_deadline ) -{ -rt_update_deadline(now, svc); -__q_remove(svc); /* remove from depleted queue */ -__runq_insert(ops, svc); /* add to runq */ -} -} -} - -/* * schedule function for rt scheduler. 
* The lock is already grabbed in schedule.c, no need to lock here */ @@ -848,7 +834,6 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched /* burn_budget would return for IDLE VCPU */ burn_budget(ops, scurr, now); -__repl_update(ops, now); if ( tasklet_work_scheduled ) { @@ -889,7 +874,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = snext->budget; /* invoke the scheduler next time */ ret.task = snext->vcpu; /* TRACE */ @@ -1033,10 +1018,6 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) { struct rt_vcpu * const svc = rt_vcpu(vc); s_time_t now = NOW(); -struct rt_private *prv = rt_priv(ops); -struct rt_vcpu *snext = NULL; /* highest priority on RunQ */ -struct rt_dom *sdom = NULL; -cpumask_t *online; BUG_ON( is_idle_vcpu(vc) ); @@ -1074,14 +1055,7 @@ rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc) /* insert svc to runq/depletedq because svc is not in queue now */ __runq_insert(ops, svc); -__repl_update(ops, now); - -ASSERT(!list_empty(>sdom)); -sdom = list_entry(prv->sdom.next, struct rt_dom, sdom_elem); -online = cpupool_scheduler_cpumask(sdom->dom->cpupool); -snext = __runq_pick(ops, online); /* pick snext from
[Xen-devel] [PATCH 0/1] Improved RTDS scheduler
The current RTDS scheduler is time driven and is called every 1ms. During each scheduler call, repl_update() scans both the runq and the depletedq, which might not be necessary every 1ms. Since each vcpu is implemented as a deferrable server, budget is preserved during its period and refilled in the next. It is not necessary to check every 1ms as the current design does. The replenishment is needed at the nearest next period (nearest current_deadline) of all runnable vcpus. This improved design tries to reduce scheduler invocations by using an event driven approach; rt_schedule() will return a value when the scheduler needs to be called next time. In addition, sched_rt will have one dedicated timer to handle replenishment when necessary. In other words, the budget replenishment and the scheduling decision (rt_schedule) are separated. Based on the previous discussion between Dario, Dagaen and Meng, the improved design can be implemented/modified as follows: rt_schedule(): picks the highest-priority runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). rt_vcpu_wake(): when a vcpu wakes up, it tickles instead of picking one from the runq. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Picking from the runq and tickling are removed. repl_handler(): a timer handler which is reprogrammed to fire at the nearest vcpu deadline to replenish vcpus on the depletedq while keeping the runq sorted. When the replenishment is done, each replenished vcpu in the runq should tickle a pcpu to see if it needs to preempt any running vcpus. An extra field to record the last replenishment time will be added. schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_depleted_vcpu(i, i.repl_time < NOW()) { replenish(i) runq_tickle(i) }> program_timer() [spin_unlock] The transient behavior should be noted. 
It happens between when a vcpu is tickled and when a pcpu actually picks it. As per previous discussions, this is unavoidable. Previous discussions: http://lists.xenproject.org/archives/html/xen-devel/2015-06/msg02629.html Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> *** BLURB HERE *** Tianyang Chen (1): Improved RTDS scheduler -cover-letter.patch| 16 +++ 0001-Improved-RTDS-scheduler.patch | 280 bak| 62 xen/common/sched_rt.c | 159 +++- 4 files changed, 453 insertions(+), 64 deletions(-) create mode 100644 -cover-letter.patch create mode 100644 0001-Improved-RTDS-scheduler.patch create mode 100644 bak -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH V2 0/1] Improved RTDS scheduler
Changes since V1: Removed redundant cover letter Removed redundant patch file that was added with the last commit Please disregard V1 because it was in the wrong format. Sorry about that. The current RTDS scheduler is time driven and is called every 1ms. During each scheduler call, repl_update() scans both the runq and the depletedq, which might not be necessary every 1ms. Since each vcpu is implemented as a deferrable server, budget is preserved during its period and refilled in the next. It is not necessary to check every 1ms as the current design does. The replenishment is needed at the nearest next period (nearest current_deadline) of all runnable vcpus. This improved design tries to reduce scheduler invocations by using an event driven approach; rt_schedule() will return a value when the scheduler needs to be called next time. In addition, sched_rt will have one dedicated timer to handle replenishment when necessary. In other words, the budget replenishment and the scheduling decision (rt_schedule) are separated. Based on the previous discussion between Dario, Dagaen and Meng, the improved design can be implemented/modified as follows: rt_schedule(): picks the highest-priority runnable vcpu based on cpu affinity and ret.time will be passed to schedule(). rt_vcpu_wake(): when a vcpu wakes up, it tickles instead of picking one from the runq. rt_context_saved(): when context switching is finished, the preempted vcpu will be put back into the runq. Picking from the runq and tickling are removed. repl_handler(): a timer handler which is reprogrammed to fire at the nearest vcpu deadline to replenish vcpus on the runq and the depletedq. When the replenishment is done, each replenished vcpu in the runq should tickle a pcpu to see if it needs to preempt any running vcpus. 
schedule.c SCHEDULE_SOFTIRQ: rt_schedule(): [spin_lock] burn_budget(scurr) snext = runq_pick() [spin_unlock] sched_rt.c TIMER_SOFTIRQ replenishment_timer_handler() [spin_lock] <for_each_depleted_vcpu(i, i.repl_time < NOW()) { replenish(i) runq_tickle(i) }> program_timer() [spin_unlock] The transient behavior should be noted. It happens between when a vcpu is tickled and when a pcpu actually picks it. As per previous discussions, this is unavoidable. Previous discussions: http://lists.xenproject.org/archives/html/xen-devel/2015-06/msg02629.html Signed-off-by: Tianyang Chen <ti...@seas.upenn.edu> Signed-off-by: Meng Xu <men...@cis.upenn.edu> Signed-off-by: Dagaen Golomb <dgol...@seas.upenn.edu> Tianyang Chen (1): Improved RTDS scheduler xen/common/sched_rt.c | 159 + 1 file changed, 95 insertions(+), 64 deletions(-) -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel
[Xen-devel] [PATCH] sched_rt: Improved nested virtualization performance
In nested virtualization, choosing the smaller value for the time slice between the MAX_SCHEDULE and the budget will cause high host CPU usage. Signed-off-by: Tianyang Chen <tianyangp...@gmail.com> --- xen/common/sched_rt.c |2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/xen/common/sched_rt.c b/xen/common/sched_rt.c index 4372486..9da3f35 100644 --- a/xen/common/sched_rt.c +++ b/xen/common/sched_rt.c @@ -889,7 +889,7 @@ rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_sched } } -ret.time = MIN(snext->budget, MAX_SCHEDULE); /* sched quantum */ +ret.time = MAX_SCHEDULE; /*should be used in nested virtualization*/ ret.task = snext->vcpu; /* TRACE */ -- 1.7.9.5 ___ Xen-devel mailing list Xen-devel@lists.xen.org http://lists.xen.org/xen-devel