Module: xenomai-3
Branch: wip/prioceil
Commit: 1d7ebd9240031e3214eb55ca0d20b999f39d43bc
URL:    http://git.xenomai.org/?p=xenomai-3.git;a=commit;h=1d7ebd9240031e3214eb55ca0d20b999f39d43bc

Author: Philippe Gerum <r...@xenomai.org>
Date:   Tue Feb 16 10:12:55 2016 +0100

cobalt/synch: add support for priority ceiling protocol

---

 include/cobalt/kernel/heap.h               |   11 +
 include/cobalt/kernel/sched-idle.h         |   12 +-
 include/cobalt/kernel/sched-rt.h           |   55 ++-
 include/cobalt/kernel/sched.h              |   94 +++-
 include/cobalt/kernel/synch.h              |   51 ++-
 include/cobalt/kernel/thread.h             |   54 ++-
 include/cobalt/uapi/asm-generic/features.h |   27 +-
 include/cobalt/uapi/kernel/synch.h         |    8 +-
 include/cobalt/uapi/kernel/thread.h        |    3 +-
 include/cobalt/uapi/kernel/types.h         |    5 +-
 include/cobalt/uapi/mutex.h                |    1 +
 include/cobalt/uapi/thread.h               |    1 +
 kernel/cobalt/posix/cond.c                 |    2 +-
 kernel/cobalt/posix/memory.h               |    6 +
 kernel/cobalt/posix/monitor.c              |   29 +-
 kernel/cobalt/posix/mqueue.c               |    4 +-
 kernel/cobalt/posix/mutex.c                |  103 +++--
 kernel/cobalt/posix/process.c              |   21 +-
 kernel/cobalt/posix/process.h              |    3 +-
 kernel/cobalt/posix/syscall.c              |    6 +-
 kernel/cobalt/posix/timerfd.c              |    2 +-
 kernel/cobalt/rtdm/drvlib.c                |    2 +-
 kernel/cobalt/sched-idle.c                 |   12 +-
 kernel/cobalt/sched-quota.c                |   24 +-
 kernel/cobalt/sched-rt.c                   |   14 +-
 kernel/cobalt/sched-sporadic.c             |   35 +-
 kernel/cobalt/sched-tp.c                   |   42 +-
 kernel/cobalt/sched-weak.c                 |   16 +-
 kernel/cobalt/sched.c                      |  126 +++++-
 kernel/cobalt/synch.c                      |  636 +++++++++++++++++++---------
 kernel/cobalt/thread.c                     |  115 ++---
 31 files changed, 1038 insertions(+), 482 deletions(-)
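
The feature wired up kernel-side by this patch is the POSIX priority ceiling
protocol (PTHREAD_PRIO_PROTECT) on Cobalt mutexes. As a hedged usage sketch,
this is what the new support enables from userland, using only standard POSIX
calls (this snippet is illustrative and not part of the patch):

	#include <pthread.h>

	static pthread_mutex_t lock;

	static void init_pp_lock(void)
	{
		pthread_mutexattr_t attr;

		pthread_mutexattr_init(&attr);
		/* Select the immediate priority ceiling protocol (PP). */
		pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
		/* Owners run at SCHED_FIFO priority 30 while holding the lock. */
		pthread_mutexattr_setprioceiling(&attr, 30);
		pthread_mutex_init(&lock, &attr);
		pthread_mutexattr_destroy(&attr);
	}

As the hunks below show, the attribute stores the ceiling as prio-1
(cobalt_mutexattr.ceiling), while the shared mutex state keeps the effective
priority (state->ceiling = attr->ceiling + 1), committed lazily through the
new pp_pending handshake.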

diff --git a/include/cobalt/kernel/heap.h b/include/cobalt/kernel/heap.h
index a1cea69..d89f25d 100644
--- a/include/cobalt/kernel/heap.h
+++ b/include/cobalt/kernel/heap.h
@@ -147,6 +147,17 @@ void xnheap_free(struct xnheap *heap, void *block);
 
 int xnheap_check_block(struct xnheap *heap, void *block);
 
+static inline void *xnheap_zalloc(struct xnheap *heap, u32 size)
+{
+       void *p;
+
+       p = xnheap_alloc(heap, size);
+       if (p)
+               memset(p, 0, size);
+
+       return p;
+}
+
 static inline char *xnstrdup(const char *s)
 {
        char *p;
diff --git a/include/cobalt/kernel/sched-idle.h b/include/cobalt/kernel/sched-idle.h
index 732ff84..75efdec 100644
--- a/include/cobalt/kernel/sched-idle.h
+++ b/include/cobalt/kernel/sched-idle.h
@@ -33,11 +33,11 @@
 
 extern struct xnsched_class xnsched_class_idle;
 
-static inline void __xnsched_idle_setparam(struct xnthread *thread,
+static inline bool __xnsched_idle_setparam(struct xnthread *thread,
                                           const union xnsched_policy_param *p)
 {
        xnthread_clear_state(thread, XNWEAK);
-       thread->cprio = p->idle.prio;
+       return xnsched_set_effective_priority(thread, p->idle.prio);
 }
 
 static inline void __xnsched_idle_getparam(struct xnthread *thread,
@@ -50,11 +50,17 @@ static inline void __xnsched_idle_trackprio(struct xnthread *thread,
                                            const union xnsched_policy_param *p)
 {
        if (p)
-               __xnsched_idle_setparam(thread, p);
+               /* Inheriting a priority-less class makes no sense. */
+               XENO_WARN_ON_ONCE(COBALT, 1);
        else
                thread->cprio = XNSCHED_IDLE_PRIO;
 }
 
+static inline void __xnsched_idle_protectprio(struct xnthread *thread, int prio)
+{
+       XENO_WARN_ON_ONCE(COBALT, 1);
+}
+
 static inline int xnsched_idle_init_thread(struct xnthread *thread)
 {
        return 0;
diff --git a/include/cobalt/kernel/sched-rt.h b/include/cobalt/kernel/sched-rt.h
index ffb7223..992a5ba 100644
--- a/include/cobalt/kernel/sched-rt.h
+++ b/include/cobalt/kernel/sched-rt.h
@@ -67,20 +67,33 @@ static inline void __xnsched_rt_dequeue(struct xnthread *thread)
        xnsched_delq(&thread->sched->rt.runnable, thread);
 }
 
-static inline void __xnsched_rt_setparam(struct xnthread *thread,
-                                        const union xnsched_policy_param *p)
+static inline void __xnsched_rt_track_weakness(struct xnthread *thread)
 {
-       thread->cprio = p->rt.prio;
-       if (!xnthread_test_state(thread, XNBOOST)) {
-#ifdef CONFIG_XENO_OPT_SCHED_WEAK
+       /*
+        * We have to track threads exiting weak scheduling, i.e. any
+        * thread leaving the WEAK class code if compiled in, or
+        * assigned a zero priority if weak threads are hosted by the
+        * RT class.
+        *
+        * CAUTION: since we need to check the effective priority
+        * level for determining the weakness state, this can only
+        * apply to non-boosted threads.
+        */
+       if (IS_ENABLED(CONFIG_XENO_OPT_SCHED_WEAK) || thread->cprio)
                xnthread_clear_state(thread, XNWEAK);
-#else
-               if (thread->cprio)
-                       xnthread_clear_state(thread, XNWEAK);
-               else
-                       xnthread_set_state(thread, XNWEAK);
-#endif
-       }
+       else
+               xnthread_set_state(thread, XNWEAK);
+}
+
+static inline bool __xnsched_rt_setparam(struct xnthread *thread,
+                                        const union xnsched_policy_param *p)
+{
+       bool ret = xnsched_set_effective_priority(thread, p->rt.prio);
+       
+       if (!xnthread_test_state(thread, XNBOOST))
+               __xnsched_rt_track_weakness(thread);
+
+       return ret;
 }
 
 static inline void __xnsched_rt_getparam(struct xnthread *thread,
@@ -93,9 +106,23 @@ static inline void __xnsched_rt_trackprio(struct xnthread *thread,
                                          const union xnsched_policy_param *p)
 {
        if (p)
-               __xnsched_rt_setparam(thread, p);
-       else
+               thread->cprio = p->rt.prio; /* Force update. */
+       else {
                thread->cprio = thread->bprio;
+               /* Leaving PI/PP, so non-boosted by definition. */
+               __xnsched_rt_track_weakness(thread);
+       }
+}
+
+static inline void __xnsched_rt_protectprio(struct xnthread *thread, int prio)
+{
+       /*
+        * The RT class supports the widest priority range from
+        * XNSCHED_CORE_MIN_PRIO to XNSCHED_CORE_MAX_PRIO inclusive,
+        * no need to cap the input value which is guaranteed to be in
+        * the range [1..XNSCHED_CORE_MAX_PRIO].
+        */
+       thread->cprio = prio;
 }
 
 static inline void __xnsched_rt_forget(struct xnthread *thread)
diff --git a/include/cobalt/kernel/sched.h b/include/cobalt/kernel/sched.h
index fe1df7d..daa7917 100644
--- a/include/cobalt/kernel/sched.h
+++ b/include/cobalt/kernel/sched.h
@@ -139,12 +139,38 @@ struct xnsched_class {
                             const union xnsched_policy_param *p);
        void (*sched_migrate)(struct xnthread *thread,
                              struct xnsched *sched);
-       void (*sched_setparam)(struct xnthread *thread,
+       /**
+        * Set base scheduling parameters. This routine is indirectly
+        * called upon a change of base scheduling settings through
+        * __xnthread_set_schedparam() -> xnsched_set_policy(),
+        * exclusively.
+        *
+        * The scheduling class implementation should do the necessary
+        * housekeeping to comply with the new settings.
+        * thread->base_class is up to date before the call is made,
+        * and should be considered for the new weighted priority
+        * calculation. On the contrary, thread->sched_class should
+        * NOT be referred to by this handler.
+        *
+        * sched_setparam() is NEVER involved in PI or PP
+        * management. However it must deny a priority update if it
+        * contradicts an ongoing boost for @a thread. This is
+        * typically what the xnsched_set_effective_priority() helper
+        * does for such handler.
+        *
+        * @param thread Affected thread.
+        * @param p New base policy settings.
+        *
+        * @return True if the effective priority was updated
+        * (thread->cprio).
+        */
+       bool (*sched_setparam)(struct xnthread *thread,
                               const union xnsched_policy_param *p);
        void (*sched_getparam)(struct xnthread *thread,
                               union xnsched_policy_param *p);
        void (*sched_trackprio)(struct xnthread *thread,
                                const union xnsched_policy_param *p);
+       void (*sched_protectprio)(struct xnthread *thread, int prio);
        int (*sched_declare)(struct xnthread *thread,
                             const union xnsched_policy_param *p);
        void (*sched_forget)(struct xnthread *thread);
@@ -371,6 +397,9 @@ static inline void xnsched_reset_watchdog(struct xnsched *sched)
 }
 #endif /* CONFIG_XENO_OPT_WATCHDOG */
 
+bool xnsched_set_effective_priority(struct xnthread *thread,
+                                   int prio);
+
 #include <cobalt/kernel/sched-idle.h>
 #include <cobalt/kernel/sched-rt.h>
 
@@ -395,6 +424,9 @@ int xnsched_set_policy(struct xnthread *thread,
 void xnsched_track_policy(struct xnthread *thread,
                          struct xnthread *target);
 
+void xnsched_protect_priority(struct xnthread *thread,
+                             int prio);
+
 void xnsched_migrate(struct xnthread *thread,
                     struct xnsched *sched);
 
@@ -473,7 +505,7 @@ static inline void xnsched_tick(struct xnsched *sched)
        /*
         * A thread that undergoes round-robin scheduling only
         * consumes its time slice when it runs within its own
-        * scheduling class, which excludes temporary PIP boosts, and
+        * scheduling class, which excludes temporary PI boosts, and
         * does not hold the scheduler lock.
         */
        if (sched_class == curr->base_class &&
@@ -500,6 +532,12 @@ static inline int xnsched_declare(struct xnsched_class *sched_class,
        return 0;
 }
 
+static inline int xnsched_calc_wprio(struct xnsched_class *sched_class,
+                                    int prio)
+{
+       return prio + sched_class->weight;
+}
+
 #ifdef CONFIG_XENO_OPT_SCHED_CLASSES
 
 static inline void xnsched_enqueue(struct xnthread *thread)
@@ -526,11 +564,11 @@ static inline void xnsched_requeue(struct xnthread *thread)
                sched_class->sched_requeue(thread);
 }
 
-static inline void xnsched_setparam(struct xnthread *thread,
-                                   const union xnsched_policy_param *p)
+static inline
+bool xnsched_setparam(struct xnthread *thread,
+                     const union xnsched_policy_param *p)
 {
-       thread->sched_class->sched_setparam(thread, p);
-       thread->wprio = thread->cprio + thread->sched_class->weight;
+       return thread->base_class->sched_setparam(thread, p);
 }
 
 static inline void xnsched_getparam(struct xnthread *thread,
@@ -543,7 +581,13 @@ static inline void xnsched_trackprio(struct xnthread *thread,
                                     const union xnsched_policy_param *p)
 {
        thread->sched_class->sched_trackprio(thread, p);
-       thread->wprio = thread->cprio + thread->sched_class->weight;
+       thread->wprio = xnsched_calc_wprio(thread->sched_class, thread->cprio);
+}
+
+static inline void xnsched_protectprio(struct xnthread *thread, int prio)
+{
+       thread->sched_class->sched_protectprio(thread, prio);
+       thread->wprio = xnsched_calc_wprio(thread->sched_class, thread->cprio);
 }
 
 static inline void xnsched_forget(struct xnthread *thread)
@@ -599,17 +643,15 @@ static inline void xnsched_requeue(struct xnthread *thread)
                __xnsched_rt_requeue(thread);
 }
 
-static inline void xnsched_setparam(struct xnthread *thread,
+static inline bool xnsched_setparam(struct xnthread *thread,
                                    const union xnsched_policy_param *p)
 {
-       struct xnsched_class *sched_class = thread->sched_class;
+       struct xnsched_class *sched_class = thread->base_class;
 
-       if (sched_class != &xnsched_class_idle)
-               __xnsched_rt_setparam(thread, p);
-       else
-               __xnsched_idle_setparam(thread, p);
+       if (sched_class == &xnsched_class_idle)
+               return __xnsched_idle_setparam(thread, p);
 
-       thread->wprio = thread->cprio + sched_class->weight;
+       return __xnsched_rt_setparam(thread, p);
 }
 
 static inline void xnsched_getparam(struct xnthread *thread,
@@ -617,10 +659,10 @@ static inline void xnsched_getparam(struct xnthread *thread,
 {
        struct xnsched_class *sched_class = thread->sched_class;
 
-       if (sched_class != &xnsched_class_idle)
-               __xnsched_rt_getparam(thread, p);
-       else
+       if (sched_class == &xnsched_class_idle)
                __xnsched_idle_getparam(thread, p);
+       else
+               __xnsched_rt_getparam(thread, p);
 }
 
 static inline void xnsched_trackprio(struct xnthread *thread,
@@ -628,12 +670,24 @@ static inline void xnsched_trackprio(struct xnthread *thread,
 {
        struct xnsched_class *sched_class = thread->sched_class;
 
-       if (sched_class != &xnsched_class_idle)
+       if (sched_class == &xnsched_class_idle)
+               __xnsched_idle_trackprio(thread, p);
+       else
                __xnsched_rt_trackprio(thread, p);
+
+       thread->wprio = xnsched_calc_wprio(sched_class, thread->cprio);
+}
+
+static inline void xnsched_protectprio(struct xnthread *thread, int prio)
+{
+       struct xnsched_class *sched_class = thread->sched_class;
+
+       if (sched_class == &xnsched_class_idle)
+               __xnsched_idle_protectprio(thread, prio);
        else
-               __xnsched_idle_trackprio(thread, p);
+               __xnsched_rt_protectprio(thread, prio);
 
-       thread->wprio = thread->cprio + sched_class->weight;
+       thread->wprio = xnsched_calc_wprio(sched_class, thread->cprio);
 }
 
 static inline void xnsched_forget(struct xnthread *thread)
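
Note on the sched_setparam() contract documented above: a base priority change
must always be recorded, but must not override an ongoing PI/PP boost. Below is
a deliberately simplified, illustrative sketch of what the
xnsched_set_effective_priority() helper (added to kernel/cobalt/sched.c, not
quoted in this mail) is expected to do; the in-tree version refines the test by
comparing weighted priorities:

	bool xnsched_set_effective_priority(struct xnthread *thread, int prio)
	{
		thread->bprio = prio;	/* Always record the new base priority. */

		/* Deny the effective change while a PI/PP boost is in force. */
		if (xnthread_test_state(thread, XNBOOST))
			return false;

		thread->cprio = prio;	/* Not boosted: takes effect immediately. */

		return true;
	}
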
diff --git a/include/cobalt/kernel/synch.h b/include/cobalt/kernel/synch.h
index ffb884a..04d7f10 100644
--- a/include/cobalt/kernel/synch.h
+++ b/include/cobalt/kernel/synch.h
@@ -23,12 +23,14 @@
 #include <cobalt/kernel/assert.h>
 #include <cobalt/kernel/timer.h>
 #include <cobalt/uapi/kernel/synch.h>
+#include <cobalt/uapi/kernel/thread.h>
 
 /**
  * @addtogroup cobalt_core_synch
  * @{
  */
-#define XNSYNCH_CLAIMED 0x10   /* Claimed by other thread(s) w/ PIP */
+#define XNSYNCH_CLAIMED  0x100 /* Claimed by other thread(s) (PI) */
+#define XNSYNCH_CEILING  0x200 /* Actively boosting (PP) */
 
 /* Spare flags usable by upper interfaces */
 #define XNSYNCH_SPARE0  0x01000000
@@ -49,13 +51,25 @@ struct xnthread;
 struct xnsynch;
 
 struct xnsynch {
-       struct list_head link;  /** thread->claimq */
-       int wprio;              /** wait prio in claimq */
-       unsigned long status;    /** Status word */
-       struct list_head pendq;  /** Pending threads */
-       struct xnthread *owner; /** Thread which owns the resource */
-       atomic_t *fastlock; /** Pointer to fast lock word */
-       void (*cleanup)(struct xnsynch *synch); /* Cleanup handler */
+       /** wait (weighted) prio in thread->boosters */
+       int wprio;
+       /** thread->boosters */
+       struct list_head next;
+       /**
+        *  &variable holding the current priority ceiling value
+        *  (xnsched_class_rt-based, [1..255], XNSYNCH_PP).
+        */
+       u32 *ceiling_ref;
+       /** Status word */
+       unsigned long status;
+       /** Pending threads */
+       struct list_head pendq;
+       /** Thread which owns the resource */
+       struct xnthread *owner;
+        /** Pointer to fast lock word */
+       atomic_t *fastlock;
+       /* Cleanup handler */
+       void (*cleanup)(struct xnsynch *synch);
 };
 
 #define XNSYNCH_WAITQUEUE_INITIALIZER(__name) {                \
@@ -106,31 +120,26 @@ static inline struct xnthread *xnsynch_owner(struct xnsynch *synch)
 void xnsynch_detect_relaxed_owner(struct xnsynch *synch,
                                  struct xnthread *sleeper);
 
-void xnsynch_detect_claimed_relax(struct xnthread *owner);
+void xnsynch_detect_boosted_relax(struct xnthread *owner);
 
 #else /* !XENO_DEBUG(MUTEX_RELAXED) */
 
 static inline void xnsynch_detect_relaxed_owner(struct xnsynch *synch,
-                                 struct xnthread *sleeper)
-{
-}
+                                 struct xnthread *sleeper) { }
 
-static inline void xnsynch_detect_claimed_relax(struct xnthread *owner)
-{
-}
+static inline void xnsynch_detect_boosted_relax(struct xnthread *owner) { }
 
 #endif /* !XENO_DEBUG(MUTEX_RELAXED) */
 
 void xnsynch_init(struct xnsynch *synch, int flags,
                  atomic_t *fastlock);
 
+void xnsynch_init_protect(struct xnsynch *synch, int flags,
+                         atomic_t *fastlock, u32 *ceiling_ref);
+
 int xnsynch_destroy(struct xnsynch *synch);
 
-static inline void xnsynch_set_owner(struct xnsynch *synch,
-                                    struct xnthread *thread)
-{
-       synch->owner = thread;
-}
+void xnsynch_commit_ceiling(struct xnthread *curr);
 
 static inline void xnsynch_register_cleanup(struct xnsynch *synch,
                                            void (*handler)(struct xnsynch *))
@@ -162,8 +171,6 @@ struct xnthread *xnsynch_peek_pendq(struct xnsynch *synch);
 
 int xnsynch_flush(struct xnsynch *synch, int reason);
 
-void xnsynch_release_all_ownerships(struct xnthread *thread);
-
 void xnsynch_requeue_sleeper(struct xnthread *thread);
 
 void xnsynch_forget_sleeper(struct xnthread *thread);
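
Usage note: xnsynch_init_protect() is the PP flavour of xnsynch_init(); the
extra argument points at the u32 holding the current ceiling value, so the core
can re-read it whenever the object has to boost its owner. A sketch mirroring
the way kernel/cobalt/posix/mutex.c (further down in this patch) wires it up;
struct my_state and init_gate() are illustrative names only:

	struct my_state {
		atomic_t owner;		/* fast lock word */
		u32 ceiling;		/* xnsched_class_rt priority, [1..255] */
	};

	static struct xnsynch gate;

	static void init_gate(struct my_state *state, int ceiling)
	{
		state->ceiling = ceiling;
		xnsynch_init_protect(&gate, XNSYNCH_PRIO | XNSYNCH_OWNER,
				     &state->owner, &state->ceiling);
	}
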
diff --git a/include/cobalt/kernel/thread.h b/include/cobalt/kernel/thread.h
index b45de8e..07b6996 100644
--- a/include/cobalt/kernel/thread.h
+++ b/include/cobalt/kernel/thread.h
@@ -110,9 +110,11 @@ struct xnthread {
 #endif
        cpumask_t affinity;     /* Processor affinity. */
 
-       int bprio;              /* Base priority (before PIP boost) */
+       /** Base priority (before PI/PP boost) */
+       int bprio;
 
-       int cprio;              /* Current priority */
+       /** Current (effective) priority */
+       int cprio;
 
        /**
         * Weighted priority (cprio + scheduling class weight).
@@ -122,7 +124,7 @@ struct xnthread {
        int lock_count; /** Scheduler lock count. */
 
        /**
-        * Thread holder in xnsched runnable queue. Prioritized by
+        * Thread holder in xnsched run queue. Ordered by
         * thread->cprio.
         */
        struct list_head rlink;
@@ -137,10 +139,18 @@ struct xnthread {
        struct list_head glink;
 
        /**
-        * List of xnsynch owned by this thread _and_ claimed by
-        * others (PIP).
+        * List of xnsynch owned by this thread which cause a priority
+        * boost due to one of the following reasons:
+        *
+        * - they are currently claimed by other thread(s) when
+        * enforcing the priority inheritance protocol (XNSYNCH_PI).
+        *
+        * - they require immediate priority ceiling (XNSYNCH_PP).
+        *
+        * This list is ordered by decreasing (weighted) thread
+        * priorities.
         */
-       struct list_head claimq;
+       struct list_head boosters;
 
        struct xnsynch *wchan;          /* Resource the thread pends on */
 
@@ -268,11 +278,11 @@ static inline pid_t xnthread_host_pid(struct xnthread *thread)
        return task_pid_nr(xnthread_host_task(thread));
 }
 
-#define xnthread_for_each_claimed(__pos, __thread)             \
-       list_for_each_entry(__pos, &(__thread)->claimq, link)
+#define xnthread_for_each_booster(__pos, __thread)             \
+       list_for_each_entry(__pos, &(__thread)->boosters, next)
 
-#define xnthread_for_each_claimed_safe(__pos, __tmp, __thread) \
-       list_for_each_entry_safe(__pos, __tmp, &(__thread)->claimq, link)
+#define xnthread_for_each_booster_safe(__pos, __tmp, __thread) \
+       list_for_each_entry_safe(__pos, __tmp, &(__thread)->boosters, next)
 
 #define xnthread_run_handler(__t, __h, __a...)                         \
        do {                                                            \
@@ -501,28 +511,34 @@ int xnthread_map(struct xnthread *thread,
 
 void xnthread_call_mayday(struct xnthread *thread, int reason);
 
-static inline void xnthread_get_resource(struct xnthread *thread)
+static inline void xnthread_get_resource(struct xnthread *curr)
 {
-       if (xnthread_test_state(thread, XNWEAK|XNDEBUG))
-               thread->res_count++;
+       if (xnthread_test_state(curr, XNWEAK|XNDEBUG))
+               curr->res_count++;
 }
 
-static inline int xnthread_put_resource(struct xnthread *thread)
+static inline int xnthread_put_resource(struct xnthread *curr)
 {
-       if (xnthread_test_state(thread, XNWEAK) ||
+       if (xnthread_test_state(curr, XNWEAK) ||
            IS_ENABLED(CONFIG_XENO_OPT_DEBUG_MUTEX_SLEEP)) {
-               if (unlikely(thread->res_count == 0)) {
-                       if (xnthread_test_state(thread, XNWARN))
-                               xnthread_signal(thread, SIGDEBUG,
+               if (unlikely(curr->res_count == 0)) {
+                       if (xnthread_test_state(curr, XNWARN))
+                               xnthread_signal(curr, SIGDEBUG,
                                                SIGDEBUG_RESCNT_IMBALANCE);
                        return -EPERM;
                }
-               thread->res_count--;
+               curr->res_count--;
        }
 
        return 0;
 }
 
+static inline void xnthread_commit_ceiling(struct xnthread *curr)
+{
+       if (curr->u_window->pp_pending)
+               xnsynch_commit_ceiling(curr);
+}
+
 #ifdef CONFIG_SMP
 int xnthread_migrate(int cpu);
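
Since ->boosters is kept ordered by decreasing weighted priority, the strongest
active boost is always the first entry, which is presumably what the PI/PP core
in kernel/cobalt/synch.c relies on when recomputing an owner's effective
priority. A hypothetical helper (not part of this patch) illustrating a walk
over that list with the members introduced above:

	static int top_booster_wprio(struct xnthread *owner)
	{
		struct xnsynch *synch;

		if (list_empty(&owner->boosters))
			return -1;	/* No PI/PP boost in force. */

		/* First entry == highest weighted priority requirement. */
		synch = list_first_entry(&owner->boosters, struct xnsynch, next);

		return synch->wprio;
	}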
 
diff --git a/include/cobalt/uapi/asm-generic/features.h b/include/cobalt/uapi/asm-generic/features.h
index 92f7841..8a4927c 100644
--- a/include/cobalt/uapi/asm-generic/features.h
+++ b/include/cobalt/uapi/asm-generic/features.h
@@ -50,6 +50,7 @@ struct cobalt_featinfo {
 #define __xn_feat_fastsynch   0x20000000
 #define __xn_feat_nofastsynch 0x10000000
 #define __xn_feat_control     0x08000000
+#define __xn_feat_prioceiling 0x04000000
 
 #ifdef CONFIG_SMP
 #define __xn_feat_smp_mask __xn_feat_smp
@@ -57,18 +58,32 @@ struct cobalt_featinfo {
 #define __xn_feat_smp_mask __xn_feat_nosmp
 #endif
 
+/*
+ * Revisit: all archs currently support fast locking, and there is no
+ * reason for any future port not to provide this. This will be
+ * written in stone at the next ABI update, when fastsynch support is
+ * dropped from the optional feature set.
+ */
 #define __xn_feat_fastsynch_mask __xn_feat_fastsynch
 
 /* List of generic features kernel or userland may support */
-#define __xn_feat_generic_mask \
-       (__xn_feat_smp_mask | __xn_feat_fastsynch_mask)
+#define __xn_feat_generic_mask                 \
+       (__xn_feat_smp_mask             |       \
+        __xn_feat_fastsynch_mask       |       \
+        __xn_feat_prioceiling)
 
 /*
  * List of features both sides have to agree on: If userland supports
- * it, the kernel has to provide it, too.
+ * it, the kernel has to provide it, too. This means backward
+ * compatibility between older userland and newer kernel may be
+ * supported for those features, but forward compatibility between
+ * newer userland and older kernel cannot.
  */
-#define __xn_feat_generic_man_mask \
-       (__xn_feat_fastsynch | __xn_feat_nofastsynch | __xn_feat_nosmp)
+#define __xn_feat_generic_man_mask             \
+       (__xn_feat_fastsynch            |       \
+        __xn_feat_nofastsynch          |       \
+        __xn_feat_nosmp                |       \
+        __xn_feat_prioceiling)
 
 static inline
 const char *get_generic_feature_label(unsigned int feature)
@@ -84,6 +99,8 @@ const char *get_generic_feature_label(unsigned int feature)
                return "nofastsynch";
        case __xn_feat_control:
                return "control";
+       case __xn_feat_prioceiling:
+               return "prioceiling";
        default:
                return 0;
        }
diff --git a/include/cobalt/uapi/kernel/synch.h b/include/cobalt/uapi/kernel/synch.h
index 6947058..33d7ab3 100644
--- a/include/cobalt/uapi/kernel/synch.h
+++ b/include/cobalt/uapi/kernel/synch.h
@@ -24,10 +24,10 @@
 /* Creation flags */
 #define XNSYNCH_FIFO    0x0
 #define XNSYNCH_PRIO    0x1
-#define XNSYNCH_NOPIP   0x0
-#define XNSYNCH_PIP     0x2
+#define XNSYNCH_PI      0x2
 #define XNSYNCH_DREORD  0x4
 #define XNSYNCH_OWNER   0x8
+#define XNSYNCH_PP      0x10
 
 /* Fast lock API */
 static inline int xnsynch_fast_is_claimed(xnhandle_t handle)
@@ -40,9 +40,9 @@ static inline xnhandle_t xnsynch_fast_claimed(xnhandle_t handle)
        return handle | XNSYNCH_FLCLAIM;
 }
 
-static inline xnhandle_t xnsynch_fast_not_claimed(xnhandle_t handle)
+static inline xnhandle_t xnsynch_fast_ceiling(xnhandle_t handle)
 {
-       return handle & ~XNSYNCH_FLCLAIM;
+       return handle | XNSYNCH_FLCEIL;
 }
 
 static inline int
diff --git a/include/cobalt/uapi/kernel/thread.h b/include/cobalt/uapi/kernel/thread.h
index 89853e8..0cfc5c8 100644
--- a/include/cobalt/uapi/kernel/thread.h
+++ b/include/cobalt/uapi/kernel/thread.h
@@ -39,7 +39,7 @@
 #define XNRELAX   0x00000080 /**< Relaxed shadow thread (blocking bit) */
 #define XNMIGRATE 0x00000100 /**< Thread is currently migrating to another CPU. */
 #define XNHELD    0x00000200 /**< Thread is held to process emergency. */
-#define XNBOOST   0x00000400 /**< Undergoes a PIP boost */
+#define XNBOOST   0x00000400 /**< PI/PP boost undergoing */
 #define XNSSTEP   0x00000800 /**< Single-stepped by debugger */
 #define XNLOCK    0x00001000 /**< Scheduler lock control (pseudo-bit, not in ->state) */
 #define XNRRB     0x00002000 /**< Undergoes a round-robin scheduling */
@@ -103,6 +103,7 @@ struct xnthread_user_window {
        __u32 state;
        __u32 info;
        __u32 grant_value;
+       __u32 pp_pending;
 };
 
 #endif /* !_COBALT_UAPI_KERNEL_THREAD_H */
diff --git a/include/cobalt/uapi/kernel/types.h b/include/cobalt/uapi/kernel/types.h
index 6064249..ee5bbad 100644
--- a/include/cobalt/uapi/kernel/types.h
+++ b/include/cobalt/uapi/kernel/types.h
@@ -34,9 +34,10 @@ typedef __u32 xnhandle_t;
 #define XNSYNCH_PSHARED                ((xnhandle_t)0x40000000)
 
 /* Transient bits (expressing a status) */
-#define XNSYNCH_FLCLAIM                ((xnhandle_t)0x80000000)
+#define XNSYNCH_FLCLAIM                ((xnhandle_t)0x80000000) /* Contended. */
+#define XNSYNCH_FLCEIL         ((xnhandle_t)0x20000000) /* Ceiling active. */
 
-#define XN_HANDLE_TRANSIENT_MASK       XNSYNCH_FLCLAIM
+#define XN_HANDLE_TRANSIENT_MASK       (XNSYNCH_FLCLAIM|XNSYNCH_FLCEIL)
 
 /*
  * Strip all special bits from the handle, only retaining the object
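
Worked example of the transient bits defined above: the handle of a PP mutex
which is both contended and actively boosting its owner carries
XNSYNCH_FLCLAIM|XNSYNCH_FLCEIL, and masking with XN_HANDLE_TRANSIENT_MASK
recovers the raw registry handle (the handle value is arbitrary):

	xnhandle_t h = (xnhandle_t)0x0000002a;		/* raw object handle */

	h |= XNSYNCH_FLCLAIM | XNSYNCH_FLCEIL;		/* 0xa000002a */
	h &= ~XN_HANDLE_TRANSIENT_MASK;			/* back to 0x0000002a */
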
diff --git a/include/cobalt/uapi/mutex.h b/include/cobalt/uapi/mutex.h
index 5b38783..75e34f9 100644
--- a/include/cobalt/uapi/mutex.h
+++ b/include/cobalt/uapi/mutex.h
@@ -27,6 +27,7 @@ struct cobalt_mutex_state {
        __u32 flags;
 #define COBALT_MUTEX_COND_SIGNAL 0x00000001
 #define COBALT_MUTEX_ERRORCHECK  0x00000002
+       __u32 ceiling;
 };
 
 union cobalt_mutex_union {
diff --git a/include/cobalt/uapi/thread.h b/include/cobalt/uapi/thread.h
index 28fc681..07602db 100644
--- a/include/cobalt/uapi/thread.h
+++ b/include/cobalt/uapi/thread.h
@@ -30,6 +30,7 @@ struct cobalt_mutexattr {
        int protocol : 3;
        int pshared : 1;
        int __pad : 1;
+       int ceiling : 8;  /* prio-1, (XN)SCHED_FIFO range. */
 };
 
 struct cobalt_condattr {
diff --git a/kernel/cobalt/posix/cond.c b/kernel/cobalt/posix/cond.c
index 84ae8fd..8ad39f3 100644
--- a/kernel/cobalt/posix/cond.c
+++ b/kernel/cobalt/posix/cond.c
@@ -26,7 +26,7 @@
 static inline int
 pthread_cond_init(struct cobalt_cond_shadow *cnd, const struct cobalt_condattr *attr)
 {
 {
-       int synch_flags = XNSYNCH_PRIO | XNSYNCH_NOPIP, ret;
+       int synch_flags = XNSYNCH_PRIO, ret;
        struct cobalt_cond *cond, *old_cond;
        struct cobalt_cond_state *state;
        struct cobalt_ppd *sys_ppd;
diff --git a/kernel/cobalt/posix/memory.h b/kernel/cobalt/posix/memory.h
index 83b499d..c22417b 100644
--- a/kernel/cobalt/posix/memory.h
+++ b/kernel/cobalt/posix/memory.h
@@ -32,6 +32,12 @@ void *cobalt_umm_alloc(struct cobalt_umm *umm, __u32 size)
 }
 
 static inline
+void *cobalt_umm_zalloc(struct cobalt_umm *umm, __u32 size)
+{
+       return xnheap_zalloc(&umm->heap, size);
+}
+
+static inline
 void cobalt_umm_free(struct cobalt_umm *umm, void *p)
 {
        xnheap_free(&umm->heap, p);
diff --git a/kernel/cobalt/posix/monitor.c b/kernel/cobalt/posix/monitor.c
index f3662b6..e30a92b 100644
--- a/kernel/cobalt/posix/monitor.c
+++ b/kernel/cobalt/posix/monitor.c
@@ -84,7 +84,7 @@ COBALT_SYSCALL(monitor_init, current,
        }
 
        mon->state = state;
-       xnsynch_init(&mon->gate, XNSYNCH_PIP, &state->owner);
+       xnsynch_init(&mon->gate, XNSYNCH_PI, &state->owner);
        xnsynch_init(&mon->drain, XNSYNCH_PRIO, NULL);
        mon->flags = flags;
        mon->tmode = tmode;
@@ -109,33 +109,16 @@ COBALT_SYSCALL(monitor_init, current,
 static int monitor_enter(xnhandle_t handle, struct xnthread *curr)
 {
        struct cobalt_monitor *mon;
-       int ret = 0, info;
+       int info;
 
        mon = xnregistry_lookup(handle, NULL); /* (Re)validate. */
        if (mon == NULL || mon->magic != COBALT_MONITOR_MAGIC)
                return -EINVAL;
 
-       /*
-        * The monitor might have been exited while we were jumping
-        * there for waiting at the gate, lock atomically and return
-        * if so.
-        *
-        * NOTE: monitors do not support recursive entries.
-        */
-       ret = xnsynch_fast_acquire(mon->gate.fastlock, curr->handle);
-       switch(ret) {
-       case 0:
-               xnthread_get_resource(curr);
-               break;
-       default:
-               /* Nah, we really have to wait. */
-               info = xnsynch_acquire(&mon->gate, XN_INFINITE, XN_RELATIVE);
-               if (info & XNBREAK)
-                       return -EINTR;
-               if (info)       /* No timeout possible. */
-                       return -EINVAL;
-               break;
-       }
+       info = xnsynch_acquire(&mon->gate, XN_INFINITE, XN_RELATIVE);
+       if (info)
+               /* Break or error, no timeout possible. */
+               return info & XNBREAK ? -EINTR : -EINVAL;
 
        mon->state->flags &= ~(COBALT_MONITOR_SIGNALED|COBALT_MONITOR_BROADCAST);
 
diff --git a/kernel/cobalt/posix/mqueue.c b/kernel/cobalt/posix/mqueue.c
index 77dabce..6357d22 100644
--- a/kernel/cobalt/posix/mqueue.c
+++ b/kernel/cobalt/posix/mqueue.c
@@ -134,8 +134,8 @@ static inline int mq_init(struct cobalt_mq *mq, const struct mq_attr *attr)
        mq->memsize = memsize;
        INIT_LIST_HEAD(&mq->queued);
        mq->nrqueued = 0;
-       xnsynch_init(&mq->receivers, XNSYNCH_PRIO | XNSYNCH_NOPIP, NULL);
-       xnsynch_init(&mq->senders, XNSYNCH_PRIO | XNSYNCH_NOPIP, NULL);
+       xnsynch_init(&mq->receivers, XNSYNCH_PRIO, NULL);
+       xnsynch_init(&mq->senders, XNSYNCH_PRIO, NULL);
        mq->mem = mem;
 
        /* Fill the pool. */
diff --git a/kernel/cobalt/posix/mutex.c b/kernel/cobalt/posix/mutex.c
index 58b4f3f..c6020ff 100644
--- a/kernel/cobalt/posix/mutex.c
+++ b/kernel/cobalt/posix/mutex.c
@@ -30,11 +30,11 @@ static int cobalt_mutex_init_inner(struct cobalt_mutex_shadow *shadow,
        int synch_flags = XNSYNCH_PRIO | XNSYNCH_OWNER;
        struct cobalt_umm *umm;
        spl_t s;
-       int err;
+       int ret;
 
-       err = xnregistry_enter_anon(mutex, &mutex->resnode.handle);
-       if (err < 0)
-               return err;
+       ret = xnregistry_enter_anon(mutex, &mutex->resnode.handle);
+       if (ret < 0)
+               return ret;
 
        umm = &cobalt_ppd_get(attr->pshared)->umm;
        shadow->handle = mutex->resnode.handle;
@@ -43,11 +43,19 @@ static int cobalt_mutex_init_inner(struct cobalt_mutex_shadow *shadow,
        shadow->attr = *attr;
        shadow->state_offset = cobalt_umm_offset(umm, state);
 
-       if (attr->protocol == PTHREAD_PRIO_INHERIT)
-               synch_flags |= XNSYNCH_PIP;
-
        mutex->magic = COBALT_MUTEX_MAGIC;
-       xnsynch_init(&mutex->synchbase, synch_flags, &state->owner);
+
+       if (attr->protocol == PTHREAD_PRIO_PROTECT) {
+               state->ceiling = attr->ceiling + 1;
+               xnsynch_init_protect(&mutex->synchbase, synch_flags,
+                                    &state->owner, &state->ceiling);
+       } else {
+               state->ceiling = 0;
+               if (attr->protocol == PTHREAD_PRIO_INHERIT)
+                       synch_flags |= XNSYNCH_PI;
+               xnsynch_init(&mutex->synchbase, synch_flags, &state->owner);
+       }
+
        state->flags = (attr->type == PTHREAD_MUTEX_ERRORCHECK
                        ? COBALT_MUTEX_ERRORCHECK : 0);
        mutex->attr = *attr;
@@ -85,9 +93,9 @@ int __cobalt_mutex_acquire_unchecked(struct xnthread *cur,
        return 0;
 }
 
-int cobalt_mutex_release(struct xnthread *cur,
+int cobalt_mutex_release(struct xnthread *curr,
                         struct cobalt_mutex *mutex)
-{
+{      /* nklock held, irqs off */
        struct cobalt_mutex_state *state;
        struct cobalt_cond *cond;
        unsigned long flags;
@@ -101,6 +109,14 @@ int cobalt_mutex_release(struct xnthread *cur,
            cobalt_current_resources(mutex->attr.pshared))
                return -EPERM;
 
+       /*
+        * We are about to release a mutex which is still pending PP
+        * (i.e. we never got scheduled out while holding it). Clear
+        * the lazy handle.
+        */
+       if (mutex->resnode.handle == curr->u_window->pp_pending)
+               curr->u_window->pp_pending = XN_NO_HANDLE;
+
        state = container_of(mutex->synchbase.fastlock, struct cobalt_mutex_state, owner);
        flags = state->flags;
        need_resched = 0;
@@ -112,7 +128,7 @@ int cobalt_mutex_release(struct xnthread *cur,
                                cobalt_cond_deferred_signals(cond);
                }
        }
-       need_resched |= xnsynch_release(&mutex->synchbase, cur) != NULL;
+       need_resched |= xnsynch_release(&mutex->synchbase, curr) != NULL;
 
        return need_resched;
 }
@@ -138,7 +154,6 @@ redo:
        xnlock_get_irqsave(&nklock, s);
 
        mutex = xnregistry_lookup(handle, NULL);
-
        if (!cobalt_obj_active(mutex, COBALT_MUTEX_MAGIC, struct cobalt_mutex)) {
                ret = -EINVAL;
                goto out;
@@ -151,6 +166,8 @@ redo:
                goto out;
        }
 
+       xnthread_commit_ceiling(curr);
+
        if (xnsynch_owner_check(&mutex->synchbase, curr)) {
                if (fetch_timeout) {
                        xnlock_put_irqrestore(&nklock, s);
@@ -223,10 +240,10 @@ COBALT_SYSCALL(mutex_init, current,
                const struct cobalt_mutexattr __user *u_attr))
 {
        struct cobalt_mutex_state *state;
-       struct cobalt_mutexattr attr;
        struct cobalt_mutex_shadow mx;
+       struct cobalt_mutexattr attr;
        struct cobalt_mutex *mutex;
-       int err;
+       int ret;
 
        if (cobalt_copy_from_user(&mx, u_mx, sizeof(mx)))
                return -EFAULT;
@@ -245,11 +262,11 @@ COBALT_SYSCALL(mutex_init, current,
                return -EAGAIN;
        }
 
-       err = cobalt_mutex_init_inner(&mx, mutex, state, &attr);
-       if (err) {
+       ret = cobalt_mutex_init_inner(&mx, mutex, state, &attr);
+       if (ret) {
                xnfree(mutex);
                cobalt_umm_free(&cobalt_ppd_get(attr.pshared)->umm, state);
-               return err;
+               return ret;
        }
 
        return cobalt_copy_to_user(u_mx, &mx, sizeof(*u_mx));
@@ -303,19 +320,22 @@ COBALT_SYSCALL(mutex_trylock, primary,
        struct cobalt_mutex *mutex;
        xnhandle_t handle;
        spl_t s;
-       int err;
+       int ret;
 
        handle = cobalt_get_handle_from_user(&u_mx->handle);
 
        xnlock_get_irqsave(&nklock, s);
+
        mutex = xnregistry_lookup(handle, NULL);
        if (!cobalt_obj_active(mutex, COBALT_MUTEX_MAGIC, typeof(*mutex))) {
-               err = -EINVAL;
-               goto err_unlock;
+               ret = -EINVAL;
+               goto out;
        }
 
-       err = xnsynch_fast_acquire(mutex->synchbase.fastlock, curr->handle);
-       switch(err) {
+       xnthread_commit_ceiling(curr);
+
+       ret = xnsynch_fast_acquire(mutex->synchbase.fastlock, curr->handle);
+       switch(ret) {
        case 0:
                xnthread_get_resource(curr);
                break;
@@ -323,17 +343,17 @@ COBALT_SYSCALL(mutex_trylock, primary,
 /* This should not happen, as recursive mutexes are handled in
    user-space */
        case -EBUSY:
-               err = -EINVAL;
+               ret = -EINVAL;
                break;
 
        case -EAGAIN:
-               err = -EBUSY;
+               ret = -EBUSY;
                break;
        }
-  err_unlock:
+out:
        xnlock_put_irqrestore(&nklock, s);
 
-       return err;
+       return ret;
 }
 
 COBALT_SYSCALL(mutex_lock, primary,
@@ -362,26 +382,24 @@ COBALT_SYSCALL(mutex_unlock, nonrestartable,
        struct cobalt_mutex *mutex;
        struct xnthread *curr;
        xnhandle_t handle;
-       int err;
+       int ret;
        spl_t s;
 
        handle = cobalt_get_handle_from_user(&u_mx->handle);
        curr = xnthread_current();
 
        xnlock_get_irqsave(&nklock, s);
-       mutex = xnregistry_lookup(handle, NULL);
-       err = cobalt_mutex_release(curr, mutex);
-       if (err < 0)
-               goto out;
 
-       if (err) {
+       mutex = xnregistry_lookup(handle, NULL);
+       ret = cobalt_mutex_release(curr, mutex);
+       if (ret > 0) {
                xnsched_run();
-               err = 0;
+               ret = 0;
        }
- out:
+
        xnlock_put_irqrestore(&nklock, s);
 
-       return err;
+       return ret;
 }
 
 void cobalt_mutex_reclaim(struct cobalt_resnode *node, spl_t s)
@@ -402,3 +420,18 @@ void cobalt_mutex_reclaim(struct cobalt_resnode *node, spl_t s)
        cobalt_umm_free(&cobalt_ppd_get(pshared)->umm, state);
        xnfree(mutex);
 }
+
+struct xnsynch *lookup_lazy_pp(xnhandle_t handle)
+{                              /* nklock held, irqs off */
+       struct cobalt_mutex *mutex;
+
+       /* Only mutexes may be PP-enabled. */
+       
+       mutex = xnregistry_lookup(handle, NULL);
+       if (mutex == NULL ||
+           !cobalt_obj_active(mutex, COBALT_MUTEX_MAGIC, struct cobalt_mutex) ||
+           mutex->attr.protocol != PTHREAD_PRIO_PROTECT)
+               return NULL;
+
+       return &mutex->synchbase;
+}
diff --git a/kernel/cobalt/posix/process.c b/kernel/cobalt/posix/process.c
index 5c152e8..0d225bb 100644
--- a/kernel/cobalt/posix/process.c
+++ b/kernel/cobalt/posix/process.c
@@ -349,15 +349,22 @@ int cobalt_bind_personality(unsigned int magic)
        return ret ?: xid;
 }
 
-int cobalt_bind_core(void)
+int cobalt_bind_core(int ufeatures)
 {
+       struct cobalt_process *process;
        int ret;
 
        mutex_lock(&personality_lock);
        ret = bind_personality(&cobalt_personality);
        mutex_unlock(&personality_lock);
+       if (ret)
+               return ret;
 
-       return ret;
+       process = cobalt_current_process();
+       /* Feature set userland knows about. */
+       process->ufeatures = ufeatures;
+
+       return 0;
 }
 
 /**
@@ -668,7 +675,7 @@ int cobalt_map_user(struct xnthread *thread, __u32 __user *u_winoff)
                return ret;
 
        umm = &cobalt_kernel_ppd.umm;
-       u_window = cobalt_umm_alloc(umm, sizeof(*u_window));
+       u_window = cobalt_umm_zalloc(umm, sizeof(*u_window));
        if (u_window == NULL)
                return -ENOMEM;
 
@@ -1215,7 +1222,7 @@ static int handle_cleanup_event(struct mm_struct *mm)
 {
        struct cobalt_process *old, *process;
        struct cobalt_ppd *sys_ppd;
-       struct xnthread *thread;
+       struct xnthread *curr;
 
        /*
         * We are NOT called for exiting kernel shadows.
@@ -1238,8 +1245,8 @@ static int handle_cleanup_event(struct mm_struct *mm)
                 * running though, we have to disable the event
                 * notifier manually for it.
                 */
-               thread = xnthread_current();
-               running_exec = thread && (current->flags & PF_EXITING) == 0;
+               curr = xnthread_current();
+               running_exec = curr && (current->flags & PF_EXITING) == 0;
                if (running_exec) {
                        __handle_taskexit_event(current);
                        ipipe_disable_notifier(current);
@@ -1247,7 +1254,7 @@ static int handle_cleanup_event(struct mm_struct *mm)
                if (atomic_dec_and_test(&sys_ppd->refcnt))
                        remove_process(process);
                if (running_exec) {
-                       __xnthread_cleanup(thread);
+                       __xnthread_cleanup(curr);
                        clear_threadinfo();
                }
        }
diff --git a/kernel/cobalt/posix/process.h b/kernel/cobalt/posix/process.h
index ee4b225..ad1f4c5 100644
--- a/kernel/cobalt/posix/process.h
+++ b/kernel/cobalt/posix/process.h
@@ -54,6 +54,7 @@ struct cobalt_process {
        DECLARE_BITMAP(timers_map, CONFIG_XENO_OPT_NRTIMERS);
        struct cobalt_timer *timers[CONFIG_XENO_OPT_NRTIMERS];
        void *priv[NR_PERSONALITIES];
+       int ufeatures;
 };
 
 struct cobalt_resnode {
@@ -71,7 +72,7 @@ struct xnthread_personality *cobalt_push_personality(int xid);
 
 void cobalt_pop_personality(struct xnthread_personality *prev);
 
-int cobalt_bind_core(void);
+int cobalt_bind_core(int ufeatures);
 
 int cobalt_bind_personality(unsigned int magic);
 
diff --git a/kernel/cobalt/posix/syscall.c b/kernel/cobalt/posix/syscall.c
index f7cde92..a2e87ab 100644
--- a/kernel/cobalt/posix/syscall.c
+++ b/kernel/cobalt/posix/syscall.c
@@ -314,6 +314,10 @@ static COBALT_SYSCALL(bind, lostage,
        if (!realtime_core_running() && (featreq & __xn_feat_control) == 0)
                return -EPERM;
 
+       /*
+        * Calculate the missing feature set:
+        * kernel_unavailable_set & user_mandatory_set.
+        */
        featmis = (~XENOMAI_FEAT_DEP & (featreq & XENOMAI_FEAT_MAN));
        abirev = breq.abi_rev;
 
@@ -352,7 +356,7 @@ static COBALT_SYSCALL(bind, lostage,
        if (!check_abi_revision(abirev))
                return -ENOEXEC;
 
-       return cobalt_bind_core();
+       return cobalt_bind_core(featreq);
 }
 
 static COBALT_SYSCALL(extend, lostage, (unsigned int magic))
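
To make the miss computation above concrete (the mask values below are
illustrative, not the real XENOMAI_FEAT_DEP/XENOMAI_FEAT_MAN build constants):

	unsigned int feat_dep = __xn_feat_smp | __xn_feat_fastsynch;	/* older kernel, no PP */
	unsigned int feat_man = __xn_feat_fastsynch | __xn_feat_prioceiling;
	unsigned int featreq  = feat_dep | __xn_feat_prioceiling;	/* newer libcobalt */
	unsigned int featmis  = ~feat_dep & (featreq & feat_man);
	/* featmis == __xn_feat_prioceiling, so the bind request is rejected. */
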
diff --git a/kernel/cobalt/posix/timerfd.c b/kernel/cobalt/posix/timerfd.c
index 90c4b3f..17fc16a 100644
--- a/kernel/cobalt/posix/timerfd.c
+++ b/kernel/cobalt/posix/timerfd.c
@@ -191,7 +191,7 @@ COBALT_SYSCALL(timerfd_create, lostage, (int clockid, int flags))
        curr = xnthread_current();
        xntimer_init(&tfd->timer, &nkclock, timerfd_handler,
                     curr ? curr->sched : NULL, XNTIMER_UGRAVITY);
-       xnsynch_init(&tfd->readers, XNSYNCH_PRIO | XNSYNCH_NOPIP, NULL);
+       xnsynch_init(&tfd->readers, XNSYNCH_PRIO, NULL);
        xnselect_init(&tfd->read_select);
        tfd->target = NULL;
 
diff --git a/kernel/cobalt/rtdm/drvlib.c b/kernel/cobalt/rtdm/drvlib.c
index 481ad83..cdec68d 100644
--- a/kernel/cobalt/rtdm/drvlib.c
+++ b/kernel/cobalt/rtdm/drvlib.c
@@ -1162,7 +1162,7 @@ void rtdm_mutex_init(rtdm_mutex_t *mutex)
 
        /* Make atomic for re-initialisation support */
        xnlock_get_irqsave(&nklock, s);
-       xnsynch_init(&mutex->synch_base, XNSYNCH_PIP, &mutex->fastlock);
+       xnsynch_init(&mutex->synch_base, XNSYNCH_PI, &mutex->fastlock);
        xnlock_put_irqrestore(&nklock, s);
 }
 EXPORT_SYMBOL_GPL(rtdm_mutex_init);
diff --git a/kernel/cobalt/sched-idle.c b/kernel/cobalt/sched-idle.c
index 8f59c0d..e5ca8ae 100644
--- a/kernel/cobalt/sched-idle.c
+++ b/kernel/cobalt/sched-idle.c
@@ -23,10 +23,10 @@ static struct xnthread *xnsched_idle_pick(struct xnsched *sched)
        return &sched->rootcb;
 }
 
-void xnsched_idle_setparam(struct xnthread *thread,
+bool xnsched_idle_setparam(struct xnthread *thread,
                           const union xnsched_policy_param *p)
 {
-       __xnsched_idle_setparam(thread, p);
+       return __xnsched_idle_setparam(thread, p);
 }
 
 void xnsched_idle_getparam(struct xnthread *thread,
@@ -38,7 +38,12 @@ void xnsched_idle_getparam(struct xnthread *thread,
 void xnsched_idle_trackprio(struct xnthread *thread,
                           const union xnsched_policy_param *p)
 {
-       __xnsched_rt_trackprio(thread, p);
+       __xnsched_idle_trackprio(thread, p);
+}
+
+void xnsched_idle_protectprio(struct xnthread *thread, int prio)
+{
+       __xnsched_idle_protectprio(thread, prio);
 }
 
 struct xnsched_class xnsched_class_idle = {
@@ -55,6 +60,7 @@ struct xnsched_class xnsched_class_idle = {
        .sched_setparam         =       xnsched_idle_setparam,
        .sched_getparam         =       xnsched_idle_getparam,
        .sched_trackprio        =       xnsched_idle_trackprio,
+       .sched_protectprio      =       xnsched_idle_protectprio,
        .weight                 =       XNSCHED_CLASS_WEIGHT(0),
        .policy                 =       SCHED_IDLE,
        .name                   =       "idle"
diff --git a/kernel/cobalt/sched-quota.c b/kernel/cobalt/sched-quota.c
index d0c9ac8..1cab539 100644
--- a/kernel/cobalt/sched-quota.c
+++ b/kernel/cobalt/sched-quota.c
@@ -237,14 +237,15 @@ static void xnsched_quota_init(struct xnsched *sched)
        xntimer_set_name(&qs->limit_timer, limiter_name);
 }
 
-static void xnsched_quota_setparam(struct xnthread *thread,
+static bool xnsched_quota_setparam(struct xnthread *thread,
                                   const union xnsched_policy_param *p)
 {
        struct xnsched_quota_group *tg;
        struct xnsched_quota *qs;
+       bool effective;
 
        xnthread_clear_state(thread, XNWEAK);
-       thread->cprio = p->quota.prio;
+       effective = xnsched_set_effective_priority(thread, p->quota.prio);
 
        qs = &thread->sched->quota;
        list_for_each_entry(tg, &qs->groups, next) {
@@ -258,10 +259,12 @@ static void xnsched_quota_setparam(struct xnthread *thread,
                thread->quota = tg;
                list_add(&thread->quota_next, &tg->members);
                tg->nr_threads++;
-               return;
+               return effective;
        }
 
        XENO_BUG(COBALT);
+
+       return false;
 }
 
 static void xnsched_quota_getparam(struct xnthread *thread,
@@ -275,7 +278,7 @@ static void xnsched_quota_trackprio(struct xnthread *thread,
                                    const union xnsched_policy_param *p)
 {
        if (p) {
-               /* We should not cross groups during PIP boost. */
+               /* We should not cross groups during PI boost. */
                XENO_WARN_ON(COBALT,
                             thread->base_class == &xnsched_class_quota &&
                             thread->quota->tgid != p->quota.tgid);
@@ -284,6 +287,14 @@ static void xnsched_quota_trackprio(struct xnthread *thread,
                thread->cprio = thread->bprio;
 }
 
+static void xnsched_quota_protectprio(struct xnthread *thread, int prio)
+{
+       if (prio > XNSCHED_QUOTA_MAX_PRIO)
+               prio = XNSCHED_QUOTA_MAX_PRIO;
+
+       thread->cprio = prio;
+}
+
 static int xnsched_quota_declare(struct xnthread *thread,
                                 const union xnsched_policy_param *p)
 {
@@ -469,7 +480,7 @@ static void xnsched_quota_migrate(struct xnthread *thread, struct xnsched *sched
         * the target thread to the plain RT class.
         */
        param.rt.prio = thread->cprio;
-       xnsched_set_policy(thread, &xnsched_class_rt, &param);
+       __xnthread_set_schedparam(thread, &xnsched_class_rt, &param);
 }
 
 /**
@@ -542,7 +553,7 @@ int xnsched_quota_destroy_group(struct xnsched_quota_group *tg,
                /* Move group members to the rt class. */
                list_for_each_entry_safe(thread, tmp, &tg->members, quota_next) {
                        param.rt.prio = thread->cprio;
-                       xnsched_set_policy(thread, &xnsched_class_rt, &param);
+                       __xnthread_set_schedparam(thread, &xnsched_class_rt, &param);
                }
        }
 
@@ -758,6 +769,7 @@ struct xnsched_class xnsched_class_quota = {
        .sched_setparam         =       xnsched_quota_setparam,
        .sched_getparam         =       xnsched_quota_getparam,
        .sched_trackprio        =       xnsched_quota_trackprio,
+       .sched_protectprio      =       xnsched_quota_protectprio,
        .sched_declare          =       xnsched_quota_declare,
        .sched_forget           =       xnsched_quota_forget,
        .sched_kick             =       xnsched_quota_kick,
diff --git a/kernel/cobalt/sched-rt.c b/kernel/cobalt/sched-rt.c
index 1973863..114ddad 100644
--- a/kernel/cobalt/sched-rt.c
+++ b/kernel/cobalt/sched-rt.c
@@ -69,7 +69,7 @@ static void xnsched_rt_rotate(struct xnsched *sched,
 
        /*
         * In case we picked the current thread, we have to make sure
-        * not to move it back to the runnable queue if it was blocked
+        * not to move it back to the run queue if it was blocked
         * before we were called. The same goes if the current thread
         * holds the scheduler lock.
         */
@@ -86,15 +86,15 @@ void xnsched_rt_tick(struct xnsched *sched)
         * thread that neither holds the scheduler lock nor was
         * blocked before entering this callback. As the time slice is
         * exhausted for the running thread, move it back to the
-        * runnable queue at the end of its priority group.
+        * run queue at the end of its priority group.
         */
        xnsched_putback(sched->curr);
 }
 
-void xnsched_rt_setparam(struct xnthread *thread,
+bool xnsched_rt_setparam(struct xnthread *thread,
                         const union xnsched_policy_param *p)
 {
-       __xnsched_rt_setparam(thread, p);
+       return __xnsched_rt_setparam(thread, p);
 }
 
 void xnsched_rt_getparam(struct xnthread *thread,
@@ -109,6 +109,11 @@ void xnsched_rt_trackprio(struct xnthread *thread,
        __xnsched_rt_trackprio(thread, p);
 }
 
+void xnsched_rt_protectprio(struct xnthread *thread, int prio)
+{
+       __xnsched_rt_protectprio(thread, prio);
+}
+
 #ifdef CONFIG_XENO_OPT_VFILE
 
 struct xnvfile_directory sched_rt_vfroot;
@@ -239,6 +244,7 @@ struct xnsched_class xnsched_class_rt = {
        .sched_declare          =       NULL,
        .sched_setparam         =       xnsched_rt_setparam,
        .sched_trackprio        =       xnsched_rt_trackprio,
+       .sched_protectprio      =       xnsched_rt_protectprio,
        .sched_getparam         =       xnsched_rt_getparam,
 #ifdef CONFIG_XENO_OPT_VFILE
        .sched_init_vfile       =       xnsched_rt_init_vfile,
diff --git a/kernel/cobalt/sched-sporadic.c b/kernel/cobalt/sched-sporadic.c
index a101308..4b732ca 100644
--- a/kernel/cobalt/sched-sporadic.c
+++ b/kernel/cobalt/sched-sporadic.c
@@ -150,13 +150,7 @@ retry:
 
        if (pss->budget == 0)
                return;
-       /*
-        * XXX: if moving to foreground priority downgrades an
-        * undergoing PIP boost, too bad, but the design flaw is in
-        * the application which should not make a sporadic thread
-        * compete for resources with higher priority classes in the
-        * first place.
-        */
+
        if (xnthread_test_state(thread, XNHELD))
                xnthread_resume(thread, XNHELD);
        else if (thread->cprio < pss->param.normal_prio) {
@@ -229,7 +223,7 @@ static void xnsched_sporadic_init(struct xnsched *sched)
 {
        /*
         * We litterally stack the sporadic scheduler on top of the RT
-        * one, reusing its runnable queue directly. This way, RT and
+        * one, reusing its run queue directly. This way, RT and
         * sporadic threads are merged into the same runqueue and thus
         * share the same priority scale, with the addition of budget
         * management for the sporadic ones.
@@ -239,16 +233,21 @@ static void xnsched_sporadic_init(struct xnsched *sched)
 #endif
 }
 
-static void xnsched_sporadic_setparam(struct xnthread *thread,
+static bool xnsched_sporadic_setparam(struct xnthread *thread,
                                      const union xnsched_policy_param *p)
 {
        struct xnsched_sporadic_data *pss = thread->pss;
+       bool effective;
+
+       xnthread_clear_state(thread, XNWEAK);
+       effective = xnsched_set_effective_priority(thread, p->pss.current_prio);
+
        /*
         * We use the budget information to determine whether we got
         * here from one of our internal calls to
         * xnthread_set_schedparam(), in which case we don't want to
-        * update the sporadic scheduling parameters, but only set the
-        * dynamic priority of the thread.
+        * update the scheduling parameters, but only set the
+        * effective priority.
         */
        if (p->pss.init_budget > 0) {
                pss->param = p->pss;
@@ -256,14 +255,13 @@ static void xnsched_sporadic_setparam(struct xnthread *thread,
                pss->repl_in = 0;
                pss->repl_out = 0;
                pss->repl_pending = 0;
-               if (thread == thread->sched->curr) {
+               if (effective && thread == thread->sched->curr) {
                        xntimer_stop(&pss->drop_timer);
                        sporadic_schedule_drop(thread);
                }
        }
 
-       xnthread_clear_state(thread, XNWEAK);
-       thread->cprio = p->pss.current_prio;
+       return effective;
 }
 
 static void xnsched_sporadic_getparam(struct xnthread *thread,
@@ -282,6 +280,14 @@ static void xnsched_sporadic_trackprio(struct xnthread *thread,
                thread->cprio = thread->bprio;
 }
 
+static void xnsched_sporadic_protectprio(struct xnthread *thread, int prio)
+{
+       if (prio > XNSCHED_SPORADIC_MAX_PRIO)
+               prio = XNSCHED_SPORADIC_MAX_PRIO;
+
+       thread->cprio = prio;
+}
+
 static int xnsched_sporadic_declare(struct xnthread *thread,
                                    const union xnsched_policy_param *p)
 {
@@ -530,6 +536,7 @@ struct xnsched_class xnsched_class_sporadic = {
        .sched_setparam         =       xnsched_sporadic_setparam,
        .sched_getparam         =       xnsched_sporadic_getparam,
        .sched_trackprio        =       xnsched_sporadic_trackprio,
+       .sched_protectprio      =       xnsched_sporadic_protectprio,
        .sched_declare          =       xnsched_sporadic_declare,
        .sched_forget           =       xnsched_sporadic_forget,
        .sched_kick             =       NULL,
diff --git a/kernel/cobalt/sched-tp.c b/kernel/cobalt/sched-tp.c
index 1c70092..9e9dc0d 100644
--- a/kernel/cobalt/sched-tp.c
+++ b/kernel/cobalt/sched-tp.c
@@ -107,14 +107,15 @@ static void xnsched_tp_init(struct xnsched *sched)
        xntimer_set_name(&tp->tf_timer, timer_name);
 }
 
-static void xnsched_tp_setparam(struct xnthread *thread,
+static bool xnsched_tp_setparam(struct xnthread *thread,
                                const union xnsched_policy_param *p)
 {
        struct xnsched *sched = thread->sched;
 
-       xnthread_clear_state(thread, XNWEAK);
        thread->tps = &sched->tp.partitions[p->tp.ptid];
-       thread->cprio = p->tp.prio;
+       xnthread_clear_state(thread, XNWEAK);
+
+       return xnsched_set_effective_priority(thread, p->tp.prio);
 }
 
 static void xnsched_tp_getparam(struct xnthread *thread,
@@ -128,22 +129,22 @@ static void xnsched_tp_trackprio(struct xnthread *thread,
                                 const union xnsched_policy_param *p)
 {
        /*
-        * The assigned partition never changes internally due to PIP
+        * The assigned partition never changes internally due to PI
         * (see xnsched_track_policy), since this would be pretty
         * wrong with respect to TP scheduling: i.e. we may not allow
         * a thread from another partition to consume CPU time from
-        * the current one, despite this would help enforcing PIP
-        * (*). In any case, introducing resource contention between
+        * the current one, even though this would help enforce PI (see
+        * note). In any case, introducing resource contention between
         * threads that belong to different partitions is utterly
         * wrong in the first place.  Only an explicit call to
         * xnsched_set_policy() may change the partition assigned to a
         * thread. For that reason, a policy reset action only boils
         * down to reinstating the base priority.
         *
-        * (*) However, we do allow threads from lower scheduling
-        * classes to consume CPU time from the current window as a
-        * result of a PIP boost, since this is aimed at speeding up
-        * the release of a synchronization object a TP thread needs.
+        * NOTE: we do allow threads from lower scheduling classes to
+        * consume CPU time from the current window as a result of a
+        * PI boost, since this is aimed at speeding up the release of
+        * a synchronization object a TP thread needs.
         */
        if (p) {
                /* We should never cross partition boundaries. */
@@ -155,6 +156,14 @@ static void xnsched_tp_trackprio(struct xnthread *thread,
                thread->cprio = thread->bprio;
 }
 
+static void xnsched_tp_protectprio(struct xnthread *thread, int prio)
+{
+       if (prio > XNSCHED_TP_MAX_PRIO)
+               prio = XNSCHED_TP_MAX_PRIO;
+
+       thread->cprio = prio;
+}
+
 static int xnsched_tp_declare(struct xnthread *thread,
                              const union xnsched_policy_param *p)
 {
@@ -209,12 +218,12 @@ static void xnsched_tp_migrate(struct xnthread *thread, struct xnsched *sched)
         * it cannot apply to a thread that moves to another CPU
         * anymore. So we upgrade that thread to the RT class when a
         * CPU migration occurs. A subsequent call to
-        * xnsched_set_policy() may move it back to TP scheduling,
-        * with a partition assignment that fits the remote CPU's
-        * partition schedule.
+        * __xnthread_set_schedparam() may move it back to TP
+        * scheduling, with a partition assignment that fits the
+        * remote CPU's partition schedule.
         */
        param.rt.prio = thread->cprio;
-       xnsched_set_policy(thread, &xnsched_class_rt, &param);
+       __xnthread_set_schedparam(thread, &xnsched_class_rt, &param);
 }
 
 void xnsched_tp_start_schedule(struct xnsched *sched)
@@ -255,14 +264,14 @@ xnsched_tp_set_schedule(struct xnsched *sched,
 
        /*
         * Move all TP threads on this scheduler to the RT class,
-        * until we call xnsched_set_policy() for them again.
+        * until we call __xnthread_set_schedparam() for them again.
         */
        if (list_empty(&tp->threads))
                goto done;
 
        list_for_each_entry_safe(thread, tmp, &tp->threads, tp_link) {
                param.rt.prio = thread->cprio;
-               xnsched_set_policy(thread, &xnsched_class_rt, &param);
+               __xnthread_set_schedparam(thread, &xnsched_class_rt, &param);
        }
 done:
        old_gps = tp->gps;
@@ -429,6 +438,7 @@ struct xnsched_class xnsched_class_tp = {
        .sched_setparam         =       xnsched_tp_setparam,
        .sched_getparam         =       xnsched_tp_getparam,
        .sched_trackprio        =       xnsched_tp_trackprio,
+       .sched_protectprio      =       xnsched_tp_protectprio,
        .sched_declare          =       xnsched_tp_declare,
        .sched_forget           =       xnsched_tp_forget,
        .sched_kick             =       NULL,
diff --git a/kernel/cobalt/sched-weak.c b/kernel/cobalt/sched-weak.c
index 64218d1..fc778b8 100644
--- a/kernel/cobalt/sched-weak.c
+++ b/kernel/cobalt/sched-weak.c
@@ -44,12 +44,13 @@ static struct xnthread *xnsched_weak_pick(struct xnsched *sched)
        return xnsched_getq(&sched->weak.runnable);
 }
 
-void xnsched_weak_setparam(struct xnthread *thread,
+bool xnsched_weak_setparam(struct xnthread *thread,
                           const union xnsched_policy_param *p)
 {
-       thread->cprio = p->weak.prio;
        if (!xnthread_test_state(thread, XNBOOST))
                xnthread_set_state(thread, XNWEAK);
+
+       return xnsched_set_effective_priority(thread, p->weak.prio);
 }
 
 void xnsched_weak_getparam(struct xnthread *thread,
@@ -62,11 +63,19 @@ void xnsched_weak_trackprio(struct xnthread *thread,
                            const union xnsched_policy_param *p)
 {
        if (p)
-               xnsched_weak_setparam(thread, p);
+               thread->cprio = p->weak.prio;
        else
                thread->cprio = thread->bprio;
 }
 
+void xnsched_weak_protectprio(struct xnthread *thread, int prio)
+{
+       if (prio > XNSCHED_WEAK_MAX_PRIO)
+               prio = XNSCHED_WEAK_MAX_PRIO;
+
+       thread->cprio = prio;
+}
+
 static int xnsched_weak_declare(struct xnthread *thread,
                                const union xnsched_policy_param *p)
 {
@@ -202,6 +211,7 @@ struct xnsched_class xnsched_class_weak = {
        .sched_declare          =       xnsched_weak_declare,
        .sched_setparam         =       xnsched_weak_setparam,
        .sched_trackprio        =       xnsched_weak_trackprio,
+       .sched_protectprio      =       xnsched_weak_protectprio,
        .sched_getparam         =       xnsched_weak_getparam,
 #ifdef CONFIG_XENO_OPT_VFILE
        .sched_init_vfile       =       xnsched_weak_init_vfile,
diff --git a/kernel/cobalt/sched.c b/kernel/cobalt/sched.c
index 7cb91a5..ee575d4 100644
--- a/kernel/cobalt/sched.c
+++ b/kernel/cobalt/sched.c
@@ -258,8 +258,8 @@ struct xnthread *xnsched_pick_next(struct xnsched *sched)
                        return curr;
                }
                /*
-                * Push the current thread back to the runnable queue
-                * of the scheduling class it belongs to, if not yet
+                * Push the current thread back to the run queue of
+                * the scheduling class it belongs to, if not yet
                 * linked to it (XNREADY tells us if it is).
                 */
                if (!xnthread_test_state(curr, XNREADY)) {
@@ -378,7 +378,7 @@ void xnsched_unlock(void)
 }
 EXPORT_SYMBOL_GPL(xnsched_unlock);
 
-/* Must be called with nklock locked, interrupts off. */
+/* nklock locked, interrupts off. */
 void xnsched_putback(struct xnthread *thread)
 {
        if (xnthread_test_state(thread, XNREADY))
@@ -390,11 +390,13 @@ void xnsched_putback(struct xnthread *thread)
        xnsched_set_resched(thread->sched);
 }
 
-/* Must be called with nklock locked, interrupts off. */
+/* nklock locked, interrupts off. */
 int xnsched_set_policy(struct xnthread *thread,
                       struct xnsched_class *sched_class,
                       const union xnsched_policy_param *p)
 {
+       struct xnsched_class *orig_effective_class __maybe_unused;
+       bool effective;
        int ret;
 
        /*
@@ -423,11 +425,32 @@ int xnsched_set_policy(struct xnthread *thread,
                        xnsched_forget(thread);
        }
 
-       thread->sched_class = sched_class;
+       /*
+        * Set the base and effective scheduling parameters. However,
+        * xnsched_setparam() will deny lowering the effective
+        * priority if a boost is undergoing, only recording the
+        * change into the base priority field in such situation.
+        */
        thread->base_class = sched_class;
-       xnsched_setparam(thread, p);
-       thread->bprio = thread->cprio;
-       thread->wprio = thread->cprio + sched_class->weight;
+       /*
+        * Referring to the effective class from a setparam() handler
+        * is wrong: make sure to break if so.
+        */
+       if (XENO_DEBUG(COBALT)) {
+               orig_effective_class = thread->sched_class;
+               thread->sched_class = NULL;
+       }
+
+       /*
+        * This is the ONLY place where calling xnsched_setparam() is
+        * legit, sane and safe.
+        */
+       effective = xnsched_setparam(thread, p);
+       if (effective) {
+               thread->sched_class = sched_class;
+               thread->wprio = xnsched_calc_wprio(sched_class, thread->cprio);
+       } else if (XENO_DEBUG(COBALT))
+               thread->sched_class = orig_effective_class;
 
        if (xnthread_test_state(thread, XNREADY))
                xnsched_enqueue(thread);
@@ -439,28 +462,91 @@ int xnsched_set_policy(struct xnthread *thread,
 }
 EXPORT_SYMBOL_GPL(xnsched_set_policy);
 
-/* Must be called with nklock locked, interrupts off. */
+/* nklock locked, interrupts off. */
+bool xnsched_set_effective_priority(struct xnthread *thread, int prio)
+{
+       int wprio = xnsched_calc_wprio(thread->base_class, prio);
+
+       thread->bprio = prio;
+       if (wprio == thread->wprio)
+               return true;
+
+       /*
+        * We may not lower the effective/current priority of a
+        * boosted thread when changing the base scheduling
+        * parameters. Only xnsched_track_policy() and
+        * xnsched_protect_priority() may do so when dealing with PI
+        * and PP synchs resp.
+        */
+       if (wprio < thread->wprio && xnthread_test_state(thread, XNBOOST))
+               return false;
+
+       thread->cprio = prio;
+
+       return true;
+}
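
To make the effect of xnsched_set_effective_priority() concrete, here is a
minimal user-space model of the rule it implements, assuming the weighted
priority is simply the class weight added to the priority (as in the wprio
computation removed from xnsched_set_policy() above). This is an illustrative
sketch for clarity, not kernel code:

/* Standalone sketch of the "no demotion while boosted" rule. */
#include <stdbool.h>
#include <stdio.h>

#define CLASS_WEIGHT 100        /* stand-in for sched_class->weight */

struct model_thread {
        int bprio;              /* base priority */
        int cprio;              /* current (effective) priority */
        int wprio;              /* weighted priority */
        bool boosted;           /* models XNBOOST */
};

static bool model_set_effective_priority(struct model_thread *t, int prio)
{
        int wprio = prio + CLASS_WEIGHT;

        t->bprio = prio;        /* the base priority is always recorded */
        if (wprio == t->wprio)
                return true;

        /* A boosted thread may not be demoted by a base parameter change. */
        if (wprio < t->wprio && t->boosted)
                return false;

        t->cprio = prio;
        return true;
}

int main(void)
{
        struct model_thread t = {
                .bprio = 10, .cprio = 30,
                .wprio = 30 + CLASS_WEIGHT, .boosted = true,
        };

        /* Lowering the base priority of a boosted thread is recorded
           in bprio but does not take effect on cprio. */
        printf("effective=%d cprio=%d bprio=%d\n",
               model_set_effective_priority(&t, 20), t.cprio, t.bprio);
        return 0;
}

With the thread boosted, lowering the base priority from 30 to 20 returns
false: cprio stays at 30 and only bprio records the change, which is what
allows the demotion to be deferred until the boost is dropped.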
+
+/* nklock locked, interrupts off. */
 void xnsched_track_policy(struct xnthread *thread,
                          struct xnthread *target)
 {
        union xnsched_policy_param param;
 
+       /*
+        * Inherit (or reset) the effective scheduling class and
+        * priority of a thread. Unlike xnsched_set_policy(), this
+        * routine is allowed to lower the weighted priority with no
+        * restriction, even if a boost is undergoing.
+        */
        if (xnthread_test_state(thread, XNREADY))
                xnsched_dequeue(thread);
        /*
         * Self-targeting means to reset the scheduling policy and
-        * parameters to the base ones. Otherwise, make thread inherit
-        * the scheduling data from target.
+        * parameters to the base settings. Otherwise, make thread
+        * inherit the scheduling parameters from target.
         */
        if (target == thread) {
                thread->sched_class = thread->base_class;
                xnsched_trackprio(thread, NULL);
+               /*
+                * Per SuSv2, resetting the base scheduling parameters
+                * should not move the thread to the tail of its
+                * priority group.
+                */
+               if (xnthread_test_state(thread, XNREADY))
+                       xnsched_requeue(thread);
+
        } else {
                xnsched_getparam(target, &param);
                thread->sched_class = target->sched_class;
                xnsched_trackprio(thread, &param);
+               if (xnthread_test_state(thread, XNREADY))
+                       xnsched_enqueue(thread);
        }
 
+       xnsched_set_resched(thread->sched);
+}
+
+/* nklock locked, interrupts off. */
+void xnsched_protect_priority(struct xnthread *thread, int prio)
+{
+       /*
+        * Apply a PP boost by changing the effective priority of a
+        * thread, forcing it to the RT class. Like
+        * xnsched_track_policy(), this routine is allowed to lower
+        * the weighted priority with no restriction, even if a boost
+        * is undergoing.
+        *
+        * This routine only deals with active boosts; resetting to
+        * the base priority when leaving a PP boost is achieved by a
+        * call to xnsched_track_policy().
+        */
+       if (xnthread_test_state(thread, XNREADY))
+               xnsched_dequeue(thread);
+
+       thread->sched_class = &xnsched_class_rt;
+       xnsched_protectprio(thread, prio);
+
        if (xnthread_test_state(thread, XNREADY))
                xnsched_enqueue(thread);
 
@@ -486,8 +572,7 @@ static void migrate_thread(struct xnthread *thread, struct xnsched *sched)
 }
 
 /*
- * Must be called with nklock locked, interrupts off. thread must be
- * runnable.
+ * nklock locked, interrupts off. thread must be runnable.
  */
 void xnsched_migrate(struct xnthread *thread, struct xnsched *sched)
 {
@@ -501,14 +586,13 @@ void xnsched_migrate(struct xnthread *thread, struct xnsched *sched)
         */
        xnthread_set_state(thread, XNMIGRATE);
 #else /* !CONFIG_XENO_ARCH_UNLOCKED_SWITCH */
-       /* Move thread to the remote runnable queue. */
+       /* Move thread to the remote run queue. */
        xnsched_putback(thread);
 #endif /* !CONFIG_XENO_ARCH_UNLOCKED_SWITCH */
 }
 
 /*
- * Must be called with nklock locked, interrupts off. Thread may be
- * blocked.
+ * nklock locked, interrupts off. Thread may be blocked.
  */
 void xnsched_migrate_passive(struct xnthread *thread, struct xnsched *sched)
 {
@@ -652,7 +736,7 @@ struct xnthread *xnsched_rt_pick(struct xnsched *sched)
        /*
         * The active class (i.e. ->sched_class) is the one currently
         * queuing the thread, reflecting any priority boost due to
-        * PIP.
+        * PI.
         */
        thread = list_first_entry(head, struct xnthread, rlink);
        if (unlikely(thread->sched_class != &xnsched_class_rt))
@@ -818,6 +902,11 @@ void __xnsched_run_handler(void) /* hw interrupts off. */
        xnsched_run();
 }
 
+static inline void do_lazy_user_work(struct xnthread *curr)
+{
+       xnthread_commit_ceiling(curr);
+}
+
 int ___xnsched_run(struct xnsched *sched)
 {
        struct xnthread *prev, *next, *curr;
@@ -840,6 +929,9 @@ int ___xnsched_run(struct xnsched *sched)
         */
        xntrace_pid(task_pid_nr(current), xnthread_current_priority(curr));
 reschedule:
+       if (xnthread_test_state(curr, XNUSER))
+               do_lazy_user_work(curr);
+
        switched = 0;
        if (!test_resched(sched))
                goto out;
diff --git a/kernel/cobalt/synch.c b/kernel/cobalt/synch.c
index 7773a08..976261d 100644
--- a/kernel/cobalt/synch.c
+++ b/kernel/cobalt/synch.c
@@ -25,6 +25,20 @@
 #include <cobalt/uapi/signal.h>
 #include <trace/events/cobalt-core.h>
 
+#define PP_CEILING_MASK 0xff
+
+static inline int get_ceiling_value(struct xnsynch *synch)
+{
+       /*
+        * The ceiling priority value is stored in user-writable
+        * memory; make sure to constrain it within valid bounds for
+        * xnsched_class_rt before using it.
+        */
+       return *synch->ceiling_ref & PP_CEILING_MASK ?: 1;
+}
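
As a quick illustration of this clamping (a standalone check, not part of the
patch):

/* Standalone check of the clamping rule used by get_ceiling_value(). */
#include <assert.h>
#include <stdint.h>

#define PP_CEILING_MASK 0xff

static int clamp_ceiling(uint32_t raw)
{
        /* GNU "?:" shorthand, as in the kernel code above. */
        return (raw & PP_CEILING_MASK) ?: 1;
}

int main(void)
{
        assert(clamp_ceiling(90) == 90);      /* sane value: kept */
        assert(clamp_ceiling(0x1ff) == 0xff); /* out of range: masked */
        assert(clamp_ceiling(0) == 1);        /* zero: forced to a valid prio */
        assert(clamp_ceiling(0x100) == 1);    /* masks to zero: forced to 1 */
        return 0;
}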
+
+struct xnsynch *lookup_lazy_pp(xnhandle_t handle);
+
 /**
  * @ingroup cobalt_core
  * @defgroup cobalt_core_synch Thread synchronization services
@@ -32,19 +46,16 @@
  */
 
 /**
- * @fn void xnsynch_init(struct xnsynch *synch, int flags,
- *                       atomic_t *fastlock)
- *
  * @brief Initialize a synchronization object.
  *
  * Initializes a synchronization object. Xenomai threads can wait on
  * and signal such objects for serializing access to resources.
  * This object has built-in support for priority inheritance.
  *
- * @param synch The address of a synchronization object descriptor the
- * nucleus will use to store the object-specific data.  This
- * descriptor must always be valid while the object is active
- * therefore it must be allocated in permanent memory.
+ * @param synch The address of a synchronization object descriptor
+ * Cobalt will use to store the object-specific data.  This descriptor
+ * must always be valid while the object is active therefore it must
+ * be allocated in permanent memory.
  *
  * @param flags A set of creation flags affecting the operation. The
  * valid flags are:
@@ -58,18 +69,16 @@
  * xnsynch_acquire() and xnsynch_release() instead of
  * xnsynch_sleep_on() and xnsynch_wakeup_*().
  *
- * - XNSYNCH_PIP enables priority inheritance when a priority
- * inversion is detected among threads using this object.  XNSYNCH_PIP
- * enables XNSYNCH_OWNER and XNSYNCH_PRIO implicitly.
+ * - XNSYNCH_PI enables priority inheritance when a priority inversion
+ * is detected among threads using this object.  XNSYNCH_PI implies
+ * XNSYNCH_OWNER and XNSYNCH_PRIO.
+ *
+ * - XNSYNCH_PP enables priority protect to prevent priority inversion.
+ * XNSYNCH_PP implies XNSYNCH_OWNER and XNSYNCH_PRIO.
  *
- * - XNSYNCH_DREORD (Disable REORDering) tells the nucleus that the
- * wait queue should not be reordered whenever the priority of a
- * blocked thread it holds is changed. If this flag is not specified,
- * changing the priority of a blocked thread using
- * xnthread_set_schedparam() will cause this object's wait queue to be
- * reordered according to the new priority level, provided the
- * synchronization object makes the waiters wait by priority order on
- * the awaited resource (XNSYNCH_PRIO).
+ * - XNSYNCH_DREORD (Disable REORDering) tells Cobalt not to reorder
+ * the wait list upon priority change of a waiter. Reordering is the
+ * default. Only applies when XNSYNCH_PRIO is present.
  *
  * @param fastlock Address of the fast lock word to be associated with
  * a synchronization object with ownership tracking. Therefore, a
@@ -80,13 +89,14 @@
  */
 void xnsynch_init(struct xnsynch *synch, int flags, atomic_t *fastlock)
 {
-       if (flags & XNSYNCH_PIP)
+       if (flags & (XNSYNCH_PI|XNSYNCH_PP))
                flags |= XNSYNCH_PRIO | XNSYNCH_OWNER;  /* Obviously... */
 
        synch->status = flags & ~XNSYNCH_CLAIMED;
        synch->owner = NULL;
-       synch->cleanup = NULL;  /* Only works for PIP-enabled objects. */
+       synch->cleanup = NULL;  /* for PI/PP only. */
        synch->wprio = -1;
+       synch->ceiling_ref = NULL;
        INIT_LIST_HEAD(&synch->pendq);
 
        if (flags & XNSYNCH_OWNER) {
@@ -99,6 +109,35 @@ void xnsynch_init(struct xnsynch *synch, int flags, atomic_t *fastlock)
 EXPORT_SYMBOL_GPL(xnsynch_init);
 
 /**
+ * @brief Initialize a synchronization object enforcing PP.
+ *
+ * This call is a variant of xnsynch_init() for initializing
+ * synchronization objects enabling the priority protect protocol.
+ *
+ * @param synch The address of a synchronization object descriptor
+ * Cobalt will use to store the object-specific data.  See
+ * xnsynch_init().
+ *
+ * @param flags A set of creation flags affecting the operation. See
+ * xnsynch_init(). XNSYNCH_PI is mutually exclusive with XNSYNCH_PP,
+ * and won't be considered.
+ *
+ * @param fastlock Address of the fast lock word to be associated with
+ * a synchronization object with ownership tracking. See xnsynch_init().
+ *
+ * @param ceiling_ref The address of the variable holding the current
+ * priority ceiling value for this object.
+ *
+ * @coretags{task-unrestricted}
+ */
+void xnsynch_init_protect(struct xnsynch *synch, int flags,
+                         atomic_t *fastlock, u32 *ceiling_ref)
+{
+       xnsynch_init(synch, (flags & ~XNSYNCH_PI) | XNSYNCH_PP, fastlock);
+       synch->ceiling_ref = ceiling_ref;
+}
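
A hypothetical in-kernel caller could wire up a PP-enabled object along these
lines; the storage names below (pp_fastlock, pp_ceiling, pp_synch) are purely
illustrative and not part of this patch:

/* Hypothetical initialization of a PP-enabled synchronization object. */
static atomic_t pp_fastlock = ATOMIC_INIT(0);   /* assumed equal to XN_NO_HANDLE */
static u32 pp_ceiling = 90;                     /* current ceiling value */
static struct xnsynch pp_synch;

static void init_pp_synch(void)
{
        /*
         * XNSYNCH_PP is added and XNSYNCH_PI stripped internally;
         * XNSYNCH_PRIO and XNSYNCH_OWNER are implied.
         */
        xnsynch_init_protect(&pp_synch, XNSYNCH_PRIO,
                             &pp_fastlock, &pp_ceiling);
}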
+
+/**
  * @fn void xnsynch_destroy(struct xnsynch *synch)
  * @brief Destroy a synchronization object.
  *
@@ -306,29 +345,224 @@ void xnsynch_wakeup_this_sleeper(struct xnsynch *synch, struct xnthread *sleeper
 }
 EXPORT_SYMBOL_GPL(xnsynch_wakeup_this_sleeper);
 
-/*
- * @internal
- *
- * This service is used by the PIP code to raise/lower a thread's
- * priority. The base priority value is _not_ changed and if ready,
- * the thread is always moved at the end of its priority group.
- *
- * @note There is no point in propagating Xenomai policy/priority
- * changes to linux/libc, since doing so would be papering over a
- * basic priority inversion issue in the application code. I.e. a
- * Xenomai mutex owner shall NOT enter secondary mode until it
- * eventually drops the resource - this is even triggering a debug
- * signal-, so there is no point in boosting the scheduling
- * policy/priority settings applicable to that mode anyway.
- */
-static void xnsynch_renice_thread(struct xnthread *thread,
-                                 struct xnthread *target)
+static inline void raise_boost_flag(struct xnthread *owner)
 {
+       /* Backup the base priority at first boost only. */
+       if (!xnthread_test_state(owner, XNBOOST)) {
+               owner->bprio = owner->cprio;
+               xnthread_set_state(owner, XNBOOST);
+       }
+}
+
+static void inherit_thread_priority(struct xnthread *owner,
+                                   struct xnthread *target)
+{
+       if (xnthread_test_state(owner, XNZOMBIE))
+               return;
+       
        /* Apply the scheduling policy of "target" to "thread" */
-       xnsched_track_policy(thread, target);
+       xnsched_track_policy(owner, target);
+
+       /*
+        * Owner may be sleeping, propagate priority update through
+        * the PI chain if needed.
+        */
+       if (owner->wchan)
+               xnsynch_requeue_sleeper(owner);
+}
+
+static void __ceil_owner_priority(struct xnthread *owner, int prio)
+{
+       if (xnthread_test_state(owner, XNZOMBIE))
+               return;
+       /*
+        * Raise owner priority to the ceiling value, this implicitly
+        * selects SCHED_FIFO for the owner.
+        */
+       xnsched_protect_priority(owner, prio);
+
+       if (owner->wchan)
+               xnsynch_requeue_sleeper(owner);
+}
+
+static void adjust_boost(struct xnthread *owner, struct xnthread *target)
+{
+       struct xnsynch *synch;
+
+       /*
+        * CAUTION: we may have both PI and PP-enabled objects among
+        * the boosters, so looking at the leader of synch->pendq is
+        * NOT enough for determining the next boost priority, since
+        * PP is tracked on acquisition, not on contention. Check the
+        * head of the booster list instead.
+        */
+       synch = list_first_entry(&owner->boosters, struct xnsynch, next);
+       if (synch->wprio == owner->wprio)
+               return;
+       
+       if (synch->status & XNSYNCH_PP)
+               __ceil_owner_priority(owner, get_ceiling_value(synch));
+       else {
+               XENO_BUG_ON(COBALT, list_empty(&synch->pendq));
+               if (target == NULL)
+                       target = list_first_entry(&synch->pendq,
+                                                 struct xnthread, plink);
+               inherit_thread_priority(owner, target);
+       }
+}
+
+static void ceil_owner_priority(struct xnsynch *synch)
+{
+       struct xnthread *owner = synch->owner;
+       int wprio;
+
+       /* PP ceiling values are implicitly based on the RT class. */
+       wprio = xnsched_calc_wprio(&xnsched_class_rt,
+                                  get_ceiling_value(synch));
+       synch->wprio = wprio;
+       list_add_priff(synch, &owner->boosters, wprio, next);
+       raise_boost_flag(owner);
+       synch->status |= XNSYNCH_CEILING;
+
+       /*
+        * If the ceiling value is lower than the current effective
+        * priority, we must not adjust the latter.  BEWARE: not only
+        * is this restriction required to keep the PP logic right,
+        * it is also a basic assumption made by all
+        * xnthread_commit_ceiling() callers which won't check for any
+        * rescheduling opportunity upon return.
+        *
+        * However we do want the object to be linked to the booster
+        * list, and XNBOOST must appear in the current thread status.
+        *
+        * This way, setparam() won't be allowed to decrease the
+        * current weighted priority below the ceiling value, until we
+        * eventually release this object.
+        */
+       if (wprio > owner->wprio)
+               adjust_boost(owner, NULL);
+}
+
+static inline
+void track_owner(struct xnsynch *synch, struct xnthread *owner)
+{
+       synch->owner = owner;
+}
+
+static inline  /* nklock held, irqs off */
+void set_current_owner_locked(struct xnsynch *synch, struct xnthread *owner)
+{
+       /*
+        * Update the owner information, and apply priority protection
+        * for PP objects. We may only get there if owner is current,
+        * or blocked.
+        */
+       track_owner(synch, owner);
+       if (synch->status & XNSYNCH_PP)
+               ceil_owner_priority(synch);
+}
+
+static inline
+void set_current_owner(struct xnsynch *synch, struct xnthread *owner)
+{
+       spl_t s;
+
+       track_owner(synch, owner);
+       if (synch->status & XNSYNCH_PP) {
+               xnlock_get_irqsave(&nklock, s);
+               ceil_owner_priority(synch);
+               xnlock_put_irqrestore(&nklock, s);
+       }
+}
+
+static inline
+xnhandle_t get_owner_handle(xnhandle_t ownerh, struct xnsynch *synch)
+{
+       /*
+        * On acquisition from kernel space, the fast lock handle
+        * should bear the FLCEIL bit for PP objects, so that userland
+        * takes the slow path on release, jumping to the kernel for
+        * dropping the ceiling priority boost.
+        */
+       if (synch->status & XNSYNCH_PP)
+               ownerh = xnsynch_fast_ceiling(ownerh);
+
+       return ownerh;
+}
+
+static void commit_ceiling(struct xnsynch *synch, struct xnthread *curr)
+{
+       xnhandle_t oldh, h;
+       atomic_t *lockp;
+
+       track_owner(synch, curr);
+       ceil_owner_priority(synch);
+       /*
+        * Raise FLCEIL, which indicates a kernel entry will be
+        * required for releasing this resource.
+        */
+       lockp = xnsynch_fastlock(synch);
+       do {
+               h = atomic_read(lockp);
+               oldh = atomic_cmpxchg(lockp, h, xnsynch_fast_ceiling(h));
+       } while (oldh != h);
+}
+
+void xnsynch_commit_ceiling(struct xnthread *curr)  /* nklock held, irqs off */
+{
+       struct xnsynch *synch;
+       atomic_t *lockp;
 
-       if (thread->wchan)
-               xnsynch_requeue_sleeper(thread);
+       /* curr->u_window has to be valid, curr bears XNUSER. */
+       synch = lookup_lazy_pp(curr->u_window->pp_pending);
+       if (synch == NULL) {
+               /*
+                * If pp_pending is a bad handle, don't panic but
+                * rather ignore: we don't want a misbehaving userland
+                * to crash the kernel.
+                */
+               XENO_WARN_ON_ONCE(USER, 1);
+               goto out;
+       }
+
+       /*
+        * For PP locks, userland does, in that order:
+        *
+        * -- LOCK
+        * 1. curr->u_window->pp_pending = lock_handle
+        *    barrier();
+        * 2. atomic_cmpxchg(lockp, XN_NO_HANDLE, curr->handle);
+        *
+        * -- UNLOCK
+        * 1. atomic_cmpxchg(lockp, curr->handle, XN_NO_HANDLE); [unclaimed]
+        *    barrier();
+        * 2. curr->u_window->pp_pending = XN_NO_HANDLE
+        *
+        * Make sure we have not been caught in a rescheduling in
+        * between those steps. If we did, then we won't be holding
+        * the lock as we schedule away, therefore no priority update
+        * must take place.
+        */
+       lockp = xnsynch_fastlock(synch);
+       if (xnsynch_fast_owner_check(lockp, curr->handle))
+               return;
+
+       /*
+        * In rare cases, we could be called multiple times for
+        * committing a lazy ceiling for the same object, e.g. if
+        * userland is preempted in the middle of a recursive locking
+        * sequence.
+        *
+        * This stems from the fact that userland has to update
+        * ->pp_pending prior to trying to grab the lock atomically,
+        * at which point it can figure out whether a recursive
+        * locking happened. We get out of this trap by testing the
+        * XNSYNCH_CEILING flag.
+        */
+       if ((synch->status & XNSYNCH_CEILING) == 0)
+               commit_ceiling(synch, curr);
+out:
+       curr->u_window->pp_pending = XN_NO_HANDLE;
 }
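
The ordering documented above can be modeled with plain C11 atomics as
follows; this is an illustrative user-space sketch of the protocol, not the
actual libcobalt implementation:

/* User-space model (C11 atomics) of the lock/unlock ordering above.
 * Illustration only; libcobalt's real fast paths differ in detail. */
#include <stdatomic.h>
#include <stdint.h>

typedef uint32_t handle_t;
#define NO_HANDLE ((handle_t)0)

struct window {
        _Atomic handle_t pp_pending;    /* models curr->u_window->pp_pending */
};

/* LOCK: advertise the PP commit, then try to grab the lock. */
static int fast_lock_pp(_Atomic handle_t *lockp, struct window *w,
                        handle_t lock_handle, handle_t self)
{
        handle_t expected = NO_HANDLE;

        atomic_store_explicit(&w->pp_pending, lock_handle,
                              memory_order_release);    /* step 1 + barrier */
        if (atomic_compare_exchange_strong(lockp, &expected, self))
                return 0;       /* fast path: the kernel commits the ceiling lazily */

        return -1;              /* contended: take the slow (syscall) path */
}

/* UNLOCK: release the lock, then withdraw the pending PP request. */
static int fast_unlock_pp(_Atomic handle_t *lockp, struct window *w,
                          handle_t self)
{
        handle_t expected = self;

        if (!atomic_compare_exchange_strong(lockp, &expected, NO_HANDLE))
                return -1;      /* FLCLAIM/FLCEIL raised: slow path required */

        atomic_store_explicit(&w->pp_pending, NO_HANDLE,
                              memory_order_release);    /* step 2 */
        return 0;
}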
 
 /**
@@ -371,12 +605,13 @@ int xnsynch_try_acquire(struct xnsynch *synch)
        lockp = xnsynch_fastlock(synch);
        trace_cobalt_synch_try_acquire(synch, curr);
 
-       h = atomic_cmpxchg(lockp, XN_NO_HANDLE, curr->handle);
+       h = atomic_cmpxchg(lockp, XN_NO_HANDLE,
+                          get_owner_handle(curr->handle, synch));
        if (h != XN_NO_HANDLE)
                return xnhandle_get_id(h) == curr->handle ?
                        -EDEADLK : -EBUSY;
 
-       xnsynch_set_owner(synch, curr);
+       set_current_owner(synch, curr);
        xnthread_get_resource(curr);
 
        return 0;
@@ -439,9 +674,10 @@ int xnsynch_acquire(struct xnsynch *synch, xnticks_t timeout,
        trace_cobalt_synch_acquire(synch, curr);
 redo:
        /* Basic form of xnsynch_try_acquire(). */
-       h = atomic_cmpxchg(lockp, XN_NO_HANDLE, currh);
+       h = atomic_cmpxchg(lockp, XN_NO_HANDLE,
+                          get_owner_handle(currh, synch));
        if (likely(h == XN_NO_HANDLE)) {
-               xnsynch_set_owner(synch, curr);
+               set_current_owner(synch, curr);
                xnthread_get_resource(curr);
                return 0;
        }
@@ -483,7 +719,24 @@ redo:
                goto out;
        }
 
-       xnsynch_set_owner(synch, owner);
+       /*
+        * This is the contended path. We just detected an earlier
+        * syscall-less fast locking from userland, fix up the
+        * in-kernel state information accordingly.
+        *
+        * The consistency of the state information is guaranteed,
+        * because we just raised the claim bit atomically for this
+        * contended lock, therefore userland will have to jump to the
+        * kernel when releasing it, instead of doing a fast
+        * unlock. Since we currently own the superlock, consistency
+        * wrt transfer_ownership() is guaranteed through
+        * serialization.
+        *
+        * CAUTION: in this particular case, the only assumption we
+        * can safely make is that *owner is valid but not current on
+        * this CPU.
+        */
+       track_owner(synch, owner);
        xnsynch_detect_relaxed_owner(synch, curr);
 
        if ((synch->status & XNSYNCH_PRIO) == 0) { /* i.e. FIFO */
@@ -494,7 +747,7 @@ redo:
        if (curr->wprio > owner->wprio) {
                if (xnthread_test_info(owner, XNWAKEN) && owner->wwake == synch) {
                        /* Ownership is still pending, steal the resource. */
-                       synch->owner = curr;
+                       set_current_owner_locked(synch, curr);
                        xnthread_clear_info(curr, XNRMID | XNTIMEO | XNBREAK);
                        xnthread_set_info(owner, XNROBBED);
                        goto grab;
@@ -502,20 +755,25 @@ redo:
 
                list_add_priff(curr, &synch->pendq, wprio, plink);
 
-               if (synch->status & XNSYNCH_PIP) {
-                       if (!xnthread_test_state(owner, XNBOOST)) {
-                               owner->bprio = owner->cprio;
-                               xnthread_set_state(owner, XNBOOST);
-                       }
+               if (synch->status & XNSYNCH_PI) {
+                       raise_boost_flag(owner);
 
                        if (synch->status & XNSYNCH_CLAIMED)
-                               list_del(&synch->link);
+                               list_del(&synch->next); /* owner->boosters */
                        else
                                synch->status |= XNSYNCH_CLAIMED;
 
                        synch->wprio = curr->wprio;
-                       list_add_priff(synch, &owner->claimq, wprio, link);
-                       xnsynch_renice_thread(owner, curr);
+                       list_add_priff(synch, &owner->boosters, wprio, next);
+                       /*
+                        * curr->wprio > owner->wprio implies that
+                        * synch must be leading the booster list
+                        * after insertion, so we may call
+                        * inherit_thread_priority() for tracking
+                        * current's priority directly without going
+                        * through adjust_boost().
+                        */
+                       inherit_thread_priority(owner, curr);
                }
        } else
                list_add_priff(curr, &synch->pendq, wprio, plink);
@@ -545,14 +803,14 @@ block:
                xnthread_set_info(curr, XNTIMEO);
                goto out;
        }
- grab:
+grab:
        xnthread_get_resource(curr);
 
        if (xnsynch_pended_p(synch))
                currh = xnsynch_fast_claimed(currh);
 
-       /* Set new ownership for this mutex. */
-       atomic_set(lockp, currh);
+       /* Set new ownership for this object. */
+       atomic_set(lockp, get_owner_handle(currh, synch));
 out:
        xnlock_put_irqrestore(&nklock, s);
 
@@ -560,51 +818,47 @@ out:
 }
 EXPORT_SYMBOL_GPL(xnsynch_acquire);
 
-static void clear_boost(struct xnsynch *synch, struct xnthread *owner)
+static void drop_booster(struct xnsynch *synch, struct xnthread *owner)
 {
-       struct xnthread *target;
-       struct xnsynch *hsynch;
-       int wprio;
-
-       list_del(&synch->link);
-       synch->status &= ~XNSYNCH_CLAIMED;
-       wprio = owner->bprio + owner->sched_class->weight;
+       list_del(&synch->next); /* owner->boosters */
 
-       if (list_empty(&owner->claimq)) {
+       if (list_empty(&owner->boosters)) {
                xnthread_clear_state(owner, XNBOOST);
-               target = owner;
-       } else {
-               /* Find the highest priority needed to enforce the PIP. */
-               hsynch = list_first_entry(&owner->claimq, struct xnsynch, link);
-               XENO_BUG_ON(COBALT, list_empty(&hsynch->pendq));
-       target = list_first_entry(&hsynch->pendq, struct xnthread, plink);
-               if (target->wprio > wprio)
-                       wprio = target->wprio;
-               else
-                       target = owner;
-       }
+               inherit_thread_priority(owner, owner);
+       } else
+               adjust_boost(owner, NULL);
+}
 
-       if (owner->wprio != wprio &&
-           !xnthread_test_state(owner, XNZOMBIE))
-               xnsynch_renice_thread(owner, target);
+static inline void clear_pi_boost(struct xnsynch *synch,
+                                 struct xnthread *owner)
+{      /* nklock held, irqs off */
+       synch->status &= ~XNSYNCH_CLAIMED;
+       drop_booster(synch, owner);
+}
+
+static inline void clear_pp_boost(struct xnsynch *synch,
+                                 struct xnthread *owner)
+{      /* nklock held, irqs off */
+       synch->status &= ~XNSYNCH_CEILING;
+       drop_booster(synch, owner);
 }
 
 static struct xnthread *transfer_ownership(struct xnsynch *synch,
                                           struct xnthread *lastowner)
-{
+{                              /* nklock held, irqs off */
        struct xnthread *nextowner;
        xnhandle_t nextownerh;
        atomic_t *lockp;
-       spl_t s;
-
-       xnlock_get_irqsave(&nklock, s);
 
        lockp = xnsynch_fastlock(synch);
 
+       /*
+        * Our caller checked for contention locklessly, so we do have
+        * to check again under lock in a different way.
+        */
        if (list_empty(&synch->pendq)) {
                synch->owner = NULL;
                atomic_set(lockp, XN_NO_HANDLE);
-               xnlock_put_irqrestore(&nklock, s);
                return NULL;
        }
 
@@ -612,26 +866,25 @@ static struct xnthread *transfer_ownership(struct xnsynch *synch,
        list_del(&nextowner->plink);
        nextowner->wchan = NULL;
        nextowner->wwake = synch;
-       synch->owner = nextowner;
+       set_current_owner_locked(synch, nextowner);
        xnthread_set_info(nextowner, XNWAKEN);
        xnthread_resume(nextowner, XNPEND);
 
        if (synch->status & XNSYNCH_CLAIMED)
-               clear_boost(synch, lastowner);
+               clear_pi_boost(synch, lastowner);
 
-       nextownerh = xnsynch_pended_p(synch) ?
-               xnsynch_fast_claimed(nextowner->handle) :
-               xnsynch_fast_not_claimed(nextowner->handle);
-       atomic_set(lockp, nextownerh);
+       nextownerh = get_owner_handle(nextowner->handle, synch);
+       if (xnsynch_pended_p(synch))
+               nextownerh = xnsynch_fast_claimed(nextownerh);
 
-       xnlock_put_irqrestore(&nklock, s);
+       atomic_set(lockp, nextownerh);
 
        return nextowner;
 }
 
 /**
- * @fn struct xnthread *xnsynch_release(struct xnsynch *synch, struct xnthread *thread)
- * @brief Give the resource ownership to the next waiting thread.
+ * @fn struct xnthread *xnsynch_release(struct xnsynch *synch, struct xnthread *curr)
+ * @brief Release a resource and pass it to the next waiting thread.
  *
  * This service releases the ownership of the given synchronization
  * object. The thread which is currently leading the object's pending
@@ -644,7 +897,8 @@ static struct xnthread *transfer_ownership(struct xnsynch *synch,
  * @param synch The descriptor address of the synchronization object
  * whose ownership is changed.
  *
- * @param thread The descriptor address of the current owner.
+ * @param curr The descriptor address of the current thread, which
+ * must own the object at the time of calling.
  *
  * @return The descriptor address of the unblocked thread.
  *
@@ -652,7 +906,7 @@ static struct xnthread *transfer_ownership(struct xnsynch *synch,
  *
  * - The effective priority of the previous resource owner might be
  * lowered to its base priority value as a consequence of the priority
- * inheritance boost being cleared.
+ * boost being cleared.
  *
  * - The synchronization object ownership is transfered to the
  * unblocked thread.
@@ -660,77 +914,81 @@ static struct xnthread *transfer_ownership(struct xnsynch *synch,
  * @coretags{primary-only, might-switch}
  */
 struct xnthread *xnsynch_release(struct xnsynch *synch,
-                                struct xnthread *thread)
+                                struct xnthread *curr)
 {
-       xnhandle_t threadh;
+       struct xnthread *nextowner = NULL;
+       xnhandle_t currh, h;
        atomic_t *lockp;
+       spl_t s;
 
        XENO_BUG_ON(COBALT, (synch->status & XNSYNCH_OWNER) == 0);
 
        trace_cobalt_synch_release(synch);
 
-       if (xnthread_put_resource(thread))
+       if (xnthread_put_resource(curr))
                return NULL;
 
        lockp = xnsynch_fastlock(synch);
-       XENO_BUG_ON(COBALT, lockp == NULL);
-       threadh = thread->handle;
-       if (likely(xnsynch_fast_release(lockp, threadh)))
-               return NULL;
+       currh = curr->handle;
+       /*
+        * FLCEIL may only be raised by the owner, or when the owner
+        * is blocked waiting for the synch (ownership transfer). In
+        * addition, only the current owner of a synch may release it,
+        * therefore we can't race while testing FLCEIL locklessly.
+        * All updates to FLCLAIM are covered by the superlock.
+        *
+        * Therefore, clearing the fastlock racelessly in this routine
+        * without leaking FLCEIL/FLCLAIM updates can be achieved by
+        * holding the superlock.
+        */
+       xnlock_get_irqsave(&nklock, s);
+
+       h = atomic_cmpxchg(lockp, currh, XN_NO_HANDLE);
+       if ((h & ~XNSYNCH_FLCEIL) != currh)
+               /* FLCLAIM set, synch is contended. */
+               nextowner = transfer_ownership(synch, curr);
+       else if (h != currh)    /* FLCEIL set, FLCLAIM clear. */
+               atomic_set(lockp, XN_NO_HANDLE);
+
+       if (synch->status & XNSYNCH_PP)
+               clear_pp_boost(synch, curr);
 
-       return transfer_ownership(synch, thread);
+       xnlock_put_irqrestore(&nklock, s);
+
+       return nextowner;
 }
 EXPORT_SYMBOL_GPL(xnsynch_release);
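
For reference, the three possible outcomes of that release-time cmpxchg can be
sketched as a small decision helper; the bit values are illustrative
stand-ins for the real FLCLAIM/FLCEIL handle bits, and the logic mirrors the
tests performed in xnsynch_release() above:

/* Sketch of the release-time fastlock decision; not the kernel code. */
#include <stdint.h>

#define FLCLAIM (1u << 31)      /* illustrative bit positions only */
#define FLCEIL  (1u << 30)

enum release_path {
        FAST_RELEASED,  /* cmpxchg already cleared the lock word */
        CEIL_ONLY,      /* FLCEIL set, FLCLAIM clear: just clear the word */
        CONTENDED,      /* FLCLAIM set: transfer ownership to the next waiter */
};

static enum release_path classify_release(uint32_t h /* old lock word */,
                                          uint32_t currh)
{
        if ((h & ~FLCEIL) != currh)
                return CONTENDED;
        if (h != currh)
                return CEIL_ONLY;
        return FAST_RELEASED;
}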
 
 void xnsynch_requeue_sleeper(struct xnthread *thread)
-{
+{                              /* nklock held, irqs off */
        struct xnsynch *synch = thread->wchan;
        struct xnthread *owner;
 
+       XENO_BUG_ON(COBALT, !(synch->status & XNSYNCH_PRIO));
+
        /*
-        * Update the position of a thread waiting for a lock w/ PIP
-        * enabled.
+        * Update the position in the pend queue of a thread waiting
+        * for a lock. This routine propagates the change throughout
+        * the PI chain if required.
         */
-       if ((synch->status & XNSYNCH_PRIO) == 0)
-               return;
-
        list_del(&thread->plink);
        list_add_priff(thread, &synch->pendq, wprio, plink);
        owner = synch->owner;
 
-       if (owner == NULL || thread->wprio <= owner->wprio)
+       /* Only PI-enabled objects are of interest here. */
+       if ((synch->status & XNSYNCH_PI) == 0)
                return;
 
-       /*
-        * The new (weighted) priority of the sleeping thread is
-        * higher than the priority of the current owner of the
-        * resource: we need to update the PI state.
-        */
        synch->wprio = thread->wprio;
-       if (synch->status & XNSYNCH_CLAIMED) {
-               /*
-                * The resource is already claimed, just reorder the
-                * claim queue.
-                */
-               list_del(&synch->link);
-               list_add_priff(synch, &owner->claimq, wprio, link);
-       } else {
-               /*
-                * The resource was NOT claimed, claim it now and
-                * boost the owner.
-                */
+       if (synch->status & XNSYNCH_CLAIMED)
+               list_del(&synch->next);
+       else {
                synch->status |= XNSYNCH_CLAIMED;
-               list_add_priff(synch, &owner->claimq, wprio, link);
-               if (!xnthread_test_state(owner, XNBOOST)) {
-                       owner->bprio = owner->cprio;
-                       xnthread_set_state(owner, XNBOOST);
-               }
+               raise_boost_flag(owner);
        }
-       /*
-        * Renice the owner thread, progressing in the PI chain as
-        * needed.
-        */
-       xnsynch_renice_thread(owner, thread);
+
+       list_add_priff(synch, &owner->boosters, wprio, next);
+       adjust_boost(owner, thread);
 }
 EXPORT_SYMBOL_GPL(xnsynch_requeue_sleeper);
 
@@ -749,18 +1007,15 @@ EXPORT_SYMBOL_GPL(xnsynch_requeue_sleeper);
  */
 struct xnthread *xnsynch_peek_pendq(struct xnsynch *synch)
 {
-       struct xnthread *thread;
+       struct xnthread *thread = NULL;
        spl_t s;
 
        xnlock_get_irqsave(&nklock, s);
 
-       if (list_empty(&synch->pendq)) {
-               thread = NULL;
-               goto out;
-       }
+       if (!list_empty(&synch->pendq))
+               thread = list_first_entry(&synch->pendq,
+                                         struct xnthread, plink);
 
-       thread = list_first_entry(&synch->pendq, struct xnthread, plink);
-out:
        xnlock_put_irqrestore(&nklock, s);
 
        return thread;
@@ -772,18 +1027,17 @@ EXPORT_SYMBOL_GPL(xnsynch_peek_pendq);
  * @brief Unblock all waiters pending on a resource.
  *
  * This service atomically releases all threads which currently sleep
- * on a given resource.
- *
- * This service should be called by upper interfaces under
- * circumstances requiring that the pending queue of a given resource
- * is cleared, such as before the resource is deleted.
+ * on a given resource. This service should be called by upper
+ * interfaces under circumstances requiring that the pending queue of
+ * a given resource is cleared, such as before the resource is
+ * deleted.
  *
  * @param synch The descriptor address of the synchronization object
  * to be flushed.
  *
  * @param reason Some flags to set in the information mask of every
  * unblocked thread. Zero is an acceptable value. The following bits
- * are pre-defined by the nucleus:
+ * are pre-defined by Cobalt:
  *
  * - XNRMID should be set to indicate that the synchronization object
  * is about to be destroyed (see xnthread_resume()).
@@ -798,13 +1052,10 @@ EXPORT_SYMBOL_GPL(xnsynch_peek_pendq);
  *
  * @sideeffect
  *
- * - The effective priority of the previous resource owner might be
+ * - The effective priority of the current resource owner might be
  * lowered to its base priority value as a consequence of the priority
  * inheritance boost being cleared.
  *
- * - After this operation has completed, the synchronization object is
- * not owned by any thread.
- *
  * @coretags{unrestricted}
  */
 int xnsynch_flush(struct xnsynch *synch, int reason)
@@ -829,7 +1080,7 @@ int xnsynch_flush(struct xnsynch *synch, int reason)
                        xnthread_resume(sleeper, XNPEND);
                }
                if (synch->status & XNSYNCH_CLAIMED)
-                       clear_boost(synch, synch->owner);
+                       clear_pi_boost(synch, synch->owner);
        }
 
        xnlock_put_irqrestore(&nklock, s);
@@ -839,76 +1090,57 @@ int xnsynch_flush(struct xnsynch *synch, int reason)
 EXPORT_SYMBOL_GPL(xnsynch_flush);
 
 void xnsynch_forget_sleeper(struct xnthread *thread)
-{
-       struct xnsynch *synch = thread->wchan, *nsynch;
+{                              /* nklock held, irqs off */
+       struct xnsynch *synch = thread->wchan;
        struct xnthread *owner, *target;
 
        /*
         * Do all the necessary housekeeping chores to stop a thread
-        * from waiting on a given synchronization object.
+        * from waiting on a given synchronization object. Doing so
+        * may require to update a PI chain.
         */
        trace_cobalt_synch_forget(synch);
 
        xnthread_clear_state(thread, XNPEND);
        thread->wchan = NULL;
-       list_del(&thread->plink);
+       list_del(&thread->plink); /* synch->pendq */
 
+       /*
+        * Only a sleeper leaving a PI chain triggers an update.
+        * NOTE: PP objects never bear the CLAIMED bit.
+        */
        if ((synch->status & XNSYNCH_CLAIMED) == 0)
                return;
 
-       /* Find the highest priority needed to enforce the PIP. */
        owner = synch->owner;
 
        if (list_empty(&synch->pendq)) {
-               /* No more sleepers: clear the boost. */
-               clear_boost(synch, owner);
+               /* No more sleepers: clear the PI boost. */
+               clear_pi_boost(synch, owner);
                return;
        }
 
-       target = list_first_entry(&synch->pendq, struct xnthread, plink);
-       nsynch = list_first_entry(&owner->claimq, struct xnsynch, link);
-
-       if (target->wprio == nsynch->wprio)
-               return;         /* No change. */
-
        /*
-        * Reorder the claim queue, and lower the priority to the
-        * required minimum needed to prevent priority inversion.
+        * Reorder the booster queue of the current owner after we
+        * left the wait list, then set its priority to the new
+        * minimum required to prevent priority inversion.
         */
+       target = list_first_entry(&synch->pendq, struct xnthread, plink);
        synch->wprio = target->wprio;
-       list_del(&synch->link);
-       list_add_priff(synch, &owner->claimq, wprio, link);
-
-       nsynch = list_first_entry(&owner->claimq, struct xnsynch, link);
-       if (nsynch->wprio < owner->wprio)
-               xnsynch_renice_thread(owner, target);
+       list_del(&synch->next); /* owner->boosters */
+       list_add_priff(synch, &owner->boosters, wprio, next);
+       adjust_boost(owner, target);
 }
 EXPORT_SYMBOL_GPL(xnsynch_forget_sleeper);
 
-void xnsynch_release_all_ownerships(struct xnthread *thread)
-{
-       struct xnsynch *synch, *tmp;
-
-       /*
-        * Release all the ownerships obtained by a thread on
-        * synchronization objects. This routine must be entered
-        * interrupts off.
-        */
-       xnthread_for_each_claimed_safe(synch, tmp, thread) {
-               xnsynch_release(synch, thread);
-               if (synch->cleanup)
-                       synch->cleanup(synch);
-       }
-}
-EXPORT_SYMBOL_GPL(xnsynch_release_all_ownerships);
-
 #if XENO_DEBUG(MUTEX_RELAXED)
 
 /*
  * Detect when a thread is about to sleep on a synchronization
  * object currently owned by someone running in secondary mode.
  */
-void xnsynch_detect_relaxed_owner(struct xnsynch *synch, struct xnthread *sleeper)
+void xnsynch_detect_relaxed_owner(struct xnsynch *synch,
+                                 struct xnthread *sleeper)
 {
        if (xnthread_test_state(sleeper, XNWARN) &&
            !xnthread_test_info(sleeper, XNPIALERT) &&
@@ -921,14 +1153,12 @@ void xnsynch_detect_relaxed_owner(struct xnsynch *synch, struct xnthread *sleepe
 }
 
 /*
- * Detect when a thread is about to relax while holding a
- * synchronization object currently claimed by another thread, which
- * bears the TWARNSW bit (thus advertising a concern about potential
- * spurious relaxes and priority inversion). By relying on the claim
- * queue, we restrict the checks to PIP-enabled objects, but that
- * already covers most of the use cases anyway.
+ * Detect when a thread is about to relax while holding booster(s)
+ * (claimed PI or active PP object), which denotes a potential for
+ * priority inversion. In such an event, any sleeper bearing the
+ * XNWARN bit will receive a SIGDEBUG notification.
  */
-void xnsynch_detect_claimed_relax(struct xnthread *owner)
+void xnsynch_detect_boosted_relax(struct xnthread *owner)
 {
        struct xnthread *sleeper;
        struct xnsynch *synch;
@@ -936,7 +1166,7 @@ void xnsynch_detect_claimed_relax(struct xnthread *owner)
 
        xnlock_get_irqsave(&nklock, s);
 
-       xnthread_for_each_claimed(synch, owner) {
+       xnthread_for_each_booster(synch, owner) {
                xnsynch_for_each_sleeper(sleeper, synch) {
                        if (xnthread_test_state(sleeper, XNWARN)) {
                                xnthread_set_info(sleeper, XNPIALERT);
diff --git a/kernel/cobalt/thread.c b/kernel/cobalt/thread.c
index e16f7a3..4325e12 100644
--- a/kernel/cobalt/thread.c
+++ b/kernel/cobalt/thread.c
@@ -158,7 +158,7 @@ int __xnthread_init(struct xnthread *thread,
 {
        int flags = attr->flags, ret, gravity;
 
-       flags &= ~XNSUSP;
+       flags &= ~(XNSUSP|XNBOOST);
 #ifndef CONFIG_XENO_ARCH_FPU
        flags &= ~XNFPU;
 #endif
@@ -186,6 +186,9 @@ int __xnthread_init(struct xnthread *thread,
        thread->state = flags;
        thread->info = 0;
        thread->local_info = 0;
+       thread->wprio = XNSCHED_IDLE_PRIO;
+       thread->cprio = XNSCHED_IDLE_PRIO;
+       thread->bprio = XNSCHED_IDLE_PRIO;
        thread->lock_count = 0;
        thread->rrperiod = XN_INFINITE;
        thread->wchan = NULL;
@@ -195,7 +198,7 @@ int __xnthread_init(struct xnthread *thread,
        thread->handle = XN_NO_HANDLE;
        memset(&thread->stat, 0, sizeof(thread->stat));
        thread->selector = NULL;
-       INIT_LIST_HEAD(&thread->claimq);
+       INIT_LIST_HEAD(&thread->boosters);
        /* These will be filled by xnthread_start() */
        thread->entry = NULL;
        thread->cookie = NULL;
@@ -465,39 +468,55 @@ static inline void release_fpu(struct xnthread *thread)
 
 #endif /* !CONFIG_XENO_ARCH_FPU */
 
-static inline void cleanup_tcb(struct xnthread *thread) /* nklock held, irqs off */
+static inline void release_all_ownerships(struct xnthread *curr)
 {
-       struct xnsched *sched = thread->sched;
+       struct xnsynch *synch, *tmp;
 
-       list_del(&thread->glink);
+       /*
+        * Release all the ownerships obtained by a thread on
+        * synchronization objects. This routine must be entered
+        * interrupts off.
+        */
+       xnthread_for_each_booster_safe(synch, tmp, curr) {
+               xnsynch_release(synch, curr);
+               if (synch->cleanup)
+                       synch->cleanup(synch);
+       }
+}
+
+static inline void cleanup_tcb(struct xnthread *curr) /* nklock held, irqs off */
+{
+       struct xnsched *sched = curr->sched;
+
+       list_del(&curr->glink);
        cobalt_nrthreads--;
        xnvfile_touch_tag(&nkthreadlist_tag);
 
-       if (xnthread_test_state(thread, XNREADY)) {
-               XENO_BUG_ON(COBALT, xnthread_test_state(thread, XNTHREAD_BLOCK_BITS));
-               xnsched_dequeue(thread);
-               xnthread_clear_state(thread, XNREADY);
+       if (xnthread_test_state(curr, XNREADY)) {
+               XENO_BUG_ON(COBALT, xnthread_test_state(curr, XNTHREAD_BLOCK_BITS));
+               xnsched_dequeue(curr);
+               xnthread_clear_state(curr, XNREADY);
        }
 
-       if (xnthread_test_state(thread, XNPEND))
-               xnsynch_forget_sleeper(thread);
+       if (xnthread_test_state(curr, XNPEND))
+               xnsynch_forget_sleeper(curr);
 
-       xnthread_set_state(thread, XNZOMBIE);
+       xnthread_set_state(curr, XNZOMBIE);
        /*
-        * NOTE: we must be running over the root thread, or @thread
+        * NOTE: we must be running over the root thread, or @curr
         * is dormant, which means that we don't risk sched->curr to
         * disappear due to voluntary rescheduling while holding the
-        * nklock, despite @thread bears the zombie bit.
+        * nklock, despite @curr bears the zombie bit.
         */
-       xnsynch_release_all_ownerships(thread);
+       release_all_ownerships(curr);
 
-       giveup_fpu(sched, thread);
+       giveup_fpu(sched, curr);
 
-       if (moving_target(sched, thread))
+       if (moving_target(sched, curr))
                return;
 
-       xnsched_forget(thread);
-       xnthread_deregister(thread);
+       xnsched_forget(curr);
+       xnthread_deregister(curr);
 }
 
 void __xnthread_cleanup(struct xnthread *curr)
@@ -1890,12 +1909,12 @@ void xnthread_migrate_passive(struct xnthread *thread, struct xnsched *sched)
  * @sideeffect
  *
  * - This service does not call the rescheduling procedure but may
- * affect the state of the runnable queue for the previous and new
+ * affect the state of the run queue for the previous and new
  * scheduling classes.
  *
  * - Assigning the same scheduling class and parameters to a running
- * or ready thread moves it to the end of the runnable queue, thus
- * causing a manual round-robin.
+ * or ready thread moves it to the end of the run queue, thus causing
+ * a manual round-robin, except if a priority boost is undergoing.
  *
  * @coretags{task-unregistred}
  *
@@ -1925,21 +1944,6 @@ int __xnthread_set_schedparam(struct xnthread *thread,
 {
        int old_wprio, new_wprio, ret;
 
-       /*
-        * NOTE: we do not prevent the caller from altering the
-        * scheduling parameters of a thread that currently undergoes
-        * a PIP boost.
-        *
-        * Rationale: Calling xnthread_set_schedparam() carelessly
-        * with no consideration for resource management is a bug in
-        * essence, and xnthread_set_schedparam() does not have to
-        * paper over it, especially at the cost of more complexity
-        * when dealing with multiple scheduling classes.
-        *
-        * In short, callers have to make sure that lowering a thread
-        * priority is safe with respect to what their application
-        * currently does.
-        */
        old_wprio = thread->wprio;
 
        ret = xnsched_set_policy(thread, sched_class, sched_param);
@@ -1949,25 +1953,28 @@ int __xnthread_set_schedparam(struct xnthread *thread,
        new_wprio = thread->wprio;
 
        /*
-        * Update the pending order of the thread inside its wait
-        * queue, unless this behaviour has been explicitly disabled
-        * for the pended synchronization object, or the requested
-        * (weighted) priority has not changed, thus preventing
-        * spurious round-robin effects.
+        * If the thread is waiting on a synchronization object,
+        * update its position in the corresponding wait queue, unless
+        * 1) reordering is explicitly disabled, or 2) the (weighted)
+        * priority has not changed (to prevent spurious round-robin
+        * effects).
         */
-       if (old_wprio != new_wprio && thread->wchan != NULL &&
-           (thread->wchan->status & XNSYNCH_DREORD) == 0)
+       if (old_wprio != new_wprio && thread->wchan &&
+           (thread->wchan->status & (XNSYNCH_DREORD|XNSYNCH_PRIO))
+           == XNSYNCH_PRIO)
                xnsynch_requeue_sleeper(thread);
        /*
-        * We don't need/want to move the thread at the end of its
-        * priority group whenever:
-        * - it is blocked and thus not runnable;
-        * - it bears the ready bit in which case xnsched_set_policy()
-        * already reordered the runnable queue;
-        * - we currently hold the scheduler lock, so we don't want
-        * any round-robin effect to take place.
+        * We should not move the thread at the end of its priority
+        * group, if any of these conditions is true:
+        *
+        * - thread is not runnable;
+        * - thread bears the ready bit which means that xnsched_set_policy()
+        * already reordered the run queue;
+        * - thread currently holds the scheduler lock, so we don't want
+        * any round-robin effect to take place;
+        * - a priority boost is undergoing for this thread.
         */
-       if (!xnthread_test_state(thread, XNTHREAD_BLOCK_BITS|XNREADY) &&
+       if (!xnthread_test_state(thread, XNTHREAD_BLOCK_BITS|XNREADY|XNBOOST) &&
            thread->lock_count == 0)
                xnsched_putback(thread);
 
@@ -2177,7 +2184,7 @@ void xnthread_relax(int notify, int reason)
                        si.si_int = reason | sigdebug_marker;
                        send_sig_info(SIGDEBUG, &si, p);
                }
-               xnsynch_detect_claimed_relax(thread);
+               xnsynch_detect_boosted_relax(thread);
        }
 
        /*
@@ -2298,7 +2305,7 @@ static int force_wakeup(struct xnthread *thread) /* nklock locked, irqs off */
         * running by the scheduling policy module it belongs
         * to. Typically, policies enforcing a runtime budget do not
         * block threads with no budget, but rather keep them out of
-        * their runnable queue, so that ->sched_pick() won't elect
+        * their run queue, so that ->sched_pick() won't elect
         * them. We tell the policy handler about the fact that we do
         * want such thread to run until it relaxes, whatever this
         * means internally for the implementation.

