This patch introduces a new futex implementation called
throughput-optimized (TO) futexes. The goal of this new futex type
is to maximize locking throughput at the expense of fairness and
deterministic latency. Its throughput is higher than that of the
wait-wake futexes, especially on systems with a large number of CPUs
where the lock owners are unlikely to sleep. The downside is an
increase in response time variance. A lock hand-off mechanism is also
implemented to ensure that lock starvation cannot happen.
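
For illustration, the expected userspace fast paths and their kernel
fallbacks might look like the sketch below. The futex_to_lock() and
futex_to_unlock() wrapper names are hypothetical and not part of this
patch; only the FUTEX_LOCK_TO/FUTEX_UNLOCK_TO opcodes are:

	#include <linux/futex.h>
	#include <stdatomic.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef FUTEX_LOCK_TO		/* added by this patch */
	#define FUTEX_LOCK_TO	13
	#define FUTEX_UNLOCK_TO	14
	#endif

	static void futex_to_lock(atomic_uint *futex)
	{
		unsigned int zero = 0;

		/* Fast path: atomic 0 => TID transition in userspace */
		if (atomic_compare_exchange_strong(futex, &zero,
				(unsigned int)syscall(SYS_gettid)))
			return;

		/* Slow path: the kernel spins/sleeps on our behalf */
		syscall(SYS_futex, futex, FUTEX_LOCK_TO, 0, NULL, NULL, 0);
	}

	static void futex_to_unlock(atomic_uint *futex)
	{
		unsigned int tid = (unsigned int)syscall(SYS_gettid);

		/* Fast path: atomic TID => 0 transition in userspace */
		if (atomic_compare_exchange_strong(futex, &tid, 0))
			return;

		/*
		 * Slow path: FUTEX_WAITERS is set, let the kernel unlock
		 * or hand off the futex.
		 */
		syscall(SYS_futex, futex, FUTEX_UNLOCK_TO, 0, NULL, NULL, 0);
	}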

Using a futex locking microbenchmark to perform 10 million locking
operations with a critical section of 5 pause instructions across 256
threads (10M/256 locking ops each) on a 4-socket 72-core 144-thread
Haswell-EX system, the benchmark runs produced the following results:

                wait-wake futex     PI futex        TO futex
                ---------------     --------        --------
max time            3.49s            50.91s          2.65s
min time            3.24s            50.84s          0.07s
average time        3.41s            50.90s          1.84s
sys time          7m22.4s            55.73s        2m32.9s
lock count       3,090,294          9,999,813       698,318
unlock count     3,268,896          9,999,814           134

The lock and unlock counts above show the actual numbers of futex(2)
lock and unlock syscalls that were issued.
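
The microbenchmark itself is not included in this patch. Assuming the
hypothetical wrappers sketched above, each of the 256 threads runs a
loop roughly equivalent to:

	static atomic_uint lock;	/* the shared futex word */

	/* Hypothetical per-thread loop (x86; 5-pause critical section) */
	for (int i = 0; i < 10000000 / 256; i++) {
		futex_to_lock(&lock);
		for (int j = 0; j < 5; j++)
			__builtin_ia32_pause();
		futex_to_unlock(&lock);
	}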

Signed-off-by: Waiman Long <waiman.l...@hpe.com>
---
 include/uapi/linux/futex.h |    4 +
 kernel/futex.c             |  634 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 627 insertions(+), 11 deletions(-)

diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 0b1f716..e7deaf3 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -20,6 +20,8 @@
 #define FUTEX_WAKE_BITSET      10
 #define FUTEX_WAIT_REQUEUE_PI  11
 #define FUTEX_CMP_REQUEUE_PI   12
+#define FUTEX_LOCK_TO          13
+#define FUTEX_UNLOCK_TO                14
 
 #define FUTEX_PRIVATE_FLAG     128
 #define FUTEX_CLOCK_REALTIME   256
@@ -39,6 +41,8 @@
                                         FUTEX_PRIVATE_FLAG)
 #define FUTEX_CMP_REQUEUE_PI_PRIVATE   (FUTEX_CMP_REQUEUE_PI | \
                                         FUTEX_PRIVATE_FLAG)
+#define FUTEX_LOCK_TO_PRIVATE  (FUTEX_LOCK_TO | FUTEX_PRIVATE_FLAG)
+#define FUTEX_UNLOCK_TO_PRIVATE        (FUTEX_UNLOCK_TO | FUTEX_PRIVATE_FLAG)
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/kernel/futex.c b/kernel/futex.c
index f8bb93f..7daba56 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -191,6 +191,11 @@ int __read_mostly futex_cmpxchg_enabled;
 #define FLAGS_CLOCKRT          0x02
 #define FLAGS_HAS_TIMEOUT      0x04
 
+enum futex_type {
+       TYPE_PI = 0,
+       TYPE_TO,
+};
+
 /*
  * Futex state object:
  *  - Priority Inheritance state
@@ -203,13 +208,30 @@ struct futex_state {
        struct list_head list;
 
        /*
-        * The PI object:
+        * Linking into the owning hash bucket (TO futexes only)
+        */
+       struct list_head hb_list;
+
+       /*
+        * The PI or mutex object:
         */
-       struct rt_mutex pi_mutex;
+       union {
+               struct rt_mutex pi_mutex;
+               struct mutex mutex;
+       };
 
+       /*
+        * For a PI futex, owner is the task that owns the futex.
+        * For a TO futex, owner is the serialization mutex holder,
+        * which is either spinning on the futex owner or sleeping.
+        */
        struct task_struct *owner;
        atomic_t refcount;
 
+       enum futex_type type;
+
+       u32 handoff_pid;        /* TO only, PID for lock hand-off */
+
        union futex_key key;
 };
 
@@ -262,6 +284,7 @@ struct futex_hash_bucket {
        atomic_t waiters;
        spinlock_t lock;
        struct plist_head chain;
+       struct list_head fs_list;       /* List of futex state objects */
 } ____cacheline_aligned_in_smp;
 
 /*
@@ -801,8 +824,11 @@ static int refill_futex_state_cache(void)
                return -ENOMEM;
 
        INIT_LIST_HEAD(&state->list);
+       INIT_LIST_HEAD(&state->hb_list);
+
        /* pi_mutex gets initialized later */
        state->owner = NULL;
+       state->handoff_pid = 0;
        atomic_set(&state->refcount, 1);
        state->key = FUTEX_KEY_INIT;
 
@@ -836,10 +862,10 @@ static void put_futex_state(struct futex_state *state)
                return;
 
        /*
-        * If state->owner is NULL, the owner is most probably dying
-        * and has cleaned up the futex state already
+        * If state->owner is NULL and the type is TYPE_PI, the owner
+        * is most probably dying and has cleaned up the state already
         */
-       if (state->owner) {
+       if (state->owner && (state->type == TYPE_PI)) {
                raw_spin_lock_irq(&state->owner->pi_lock);
                list_del_init(&state->list);
                raw_spin_unlock_irq(&state->owner->pi_lock);
@@ -847,6 +873,10 @@ static void put_futex_state(struct futex_state *state)
                rt_mutex_proxy_unlock(&state->pi_mutex, state->owner);
        }
 
+       /*
+        * Dequeue it from the HB futex state list.
+        */
+       list_del_init(&state->hb_list);
        if (current->pi_state_cache)
                kfree(state);
        else {
@@ -919,13 +949,24 @@ void exit_pi_state_list(struct task_struct *curr)
                        continue;
                }
 
-               WARN_ON(pi_state->owner != curr);
                WARN_ON(list_empty(&pi_state->list));
+               if (pi_state->type == TYPE_PI) {
+                       WARN_ON(pi_state->owner != curr);
+                       pi_state->owner = NULL;
+               }
                list_del_init(&pi_state->list);
-               pi_state->owner = NULL;
                raw_spin_unlock_irq(&curr->pi_lock);
 
-               rt_mutex_unlock(&pi_state->pi_mutex);
+               if (pi_state->type == TYPE_PI)
+                       rt_mutex_unlock(&pi_state->pi_mutex);
+               else if (pi_state->type == TYPE_TO) {
+                       /*
+                        * Need to wake up the mutex owner.
+                        */
+                       WARN_ON(!pi_state->owner);
+                       if (pi_state->owner)
+                               wake_up_process(pi_state->owner);
+               }
 
                spin_unlock(&hb->lock);
 
@@ -997,7 +1038,7 @@ static int attach_to_pi_state(u32 uval, struct futex_state *pi_state,
        /*
         * Userspace might have messed up non-PI and PI futexes [3]
         */
-       if (unlikely(!pi_state))
+       if (unlikely(!pi_state || (pi_state->type != TYPE_PI)))
                return -EINVAL;
 
        WARN_ON(!atomic_read(&pi_state->refcount));
@@ -1115,6 +1156,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
 
        /* Store the key for possible exit cleanups: */
        pi_state->key = *key;
+       pi_state->type = TYPE_PI;
 
        WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
@@ -3075,8 +3117,8 @@ retry:
                        goto retry;
 
                /*
-                * Wake robust non-PI futexes here. The wakeup of
-                * PI futexes happens in exit_pi_state():
+                * Wake robust non-PI/TO futexes here. The wakeup of
+                * PI/TO futexes happens in exit_pi_state():
                 */
                if (!pi && (uval & FUTEX_WAITERS))
                        futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
@@ -3171,6 +3213,565 @@ void exit_robust_list(struct task_struct *curr)
                                   curr, pip);
 }
 
+#ifdef CONFIG_SMP
+/*
+ * Throughput-Optimized Futexes
+ * ----------------------------
+ *
+ * Userspace mutual exclusion locks can be implemented either with the
+ * wait-wake futexes or with the PI futexes. The wait-wake futexes have
+ * much higher throughput, but unlike the PI futexes they cannot
+ * guarantee minimal latency for high priority processes. Even so, the
+ * overhead of wait-wake futexes in userspace locking primitives can
+ * easily become a performance bottleneck on systems with a large
+ * number of cores.
+ *
+ * The throughput-optimized (TO) futex is a new futex implementation
+ * that provides higher throughput than the wait-wake futexes via the
+ * following two techniques:
+ *  1) Optimistic spinning when futex owner is actively running
+ *  2) Lock stealing
+ *
+ * Optimistic spinning isn't present in the other futex types. Lock
+ * stealing is possible with wait-wake futexes, but isn't actively
+ * encouraged as it is with TO futexes. The downside of these
+ * techniques is a much higher variance in response times.
+ *
+ * TO futexes have a built-in kernel lock hand-off mechanism that
+ * prevents lock starvation, provided the underlying kernel mutex code
+ * is itself starvation-free. For the wait-wake futexes, lock
+ * starvation prevention, if desired, has to be done in userspace.
+ *
+ * The use of TO futexes is very similar to that of the PI futexes.
+ * Locking is done by atomically transitioning the futex word from 0 to
+ * the task's thread ID. Unlocking is done by atomically changing the
+ * futex word from the thread ID back to 0. Any failure to do so
+ * requires a call into the kernel to do the locking or unlocking.
+ *
+ * Within the kernel, trylocks are done ignoring the FUTEX_WAITERS bit. The
+ * purpose of this FUTEX_WAITERS bit is to make the unlocker wake up the
+ * serialization mutex owner.
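+ *
+ * As with the PI futexes, the futex word encodes (per the definitions
+ * in include/uapi/linux/futex.h):
+ *  bits  0-29: thread ID of the lock owner (0 when unlocked)
+ *  bit  30   : FUTEX_OWNER_DIED
+ *  bit  31   : FUTEX_WAITERS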
+ *
+ * Like the PI futexes, TO futexes are orthogonal to robust futexes.
+ * TO futexes, however, handle dying processes similarly to the PI
+ * futexes, so userspace management of locks held by exiting lock
+ * holders should not be needed, except to guard against the rare case
+ * of PID wraparound.
+ *
+ * Unlike the other futex types, TO futex waiters do not queue
+ * themselves on the plist of the hash bucket. Instead, they queue up
+ * on the serialization mutex of the futex state object that is itself
+ * queued in the hash bucket.
+ */
+
+/*
+ * Look up the futex state structure.
+ *
+ * It differs from lookup_pi_state() in that it searches the fs_list of
+ * the hash bucket instead of the waiters on the plist. The HB lock must
+ * be held before calling it. If the search_only flag isn't set and no
+ * matching key is found, a new state structure is added to the list and
+ * returned.
+ *
+ * The reference count isn't incremented for a search_only call, as the
+ * futex state can't be destroyed while the HB lock is held.
+ */
+static struct futex_state *
+lookup_futex_state(struct futex_hash_bucket *hb, union futex_key *key,
+                  bool search_only)
+{
+       struct futex_state *state;
+
+       list_for_each_entry(state, &hb->fs_list, hb_list)
+               if (match_futex(key, &state->key)) {
+                       if (!search_only)
+                               atomic_inc(&state->refcount);
+                       return state;
+               }
+
+       if (search_only)
+               return NULL;
+
+       /*
+        * Push a new one into the list and return it.
+        */
+       state = alloc_futex_state();
+       state->type = TYPE_TO;
+       state->key = *key;
+       list_add(&state->hb_list, &hb->fs_list);
+       WARN_ON(atomic_read(&state->refcount) != 1);
+
+       /*
+        * Initialize the mutex structure.
+        */
+       mutex_init(&state->mutex);
+       WARN_ON(!list_empty(&state->list));
+       return state;
+}
+
+/*
+ * Fast path for dropping a reference to the futex state object without
+ * holding the HB lock.
+ *
+ * Return: 1 if successful, or 0 if the reference count would drop to 0
+ *        and the release therefore has to be done under the HB lock.
+ */
+static inline int put_futex_state_unlocked(struct futex_state *state)
+{
+       return atomic_add_unless(&state->refcount, -1, 1);
+}
+
+/*
+ * Spinning threshold before enabling lock handoff.
+ * Each sleep will decrement the threshold by 1/32 of the start value.
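+ * With these values, lock hand-off is enabled after at most 8192 spin
+ * iterations, or sooner when sleeps occur (each sleep counts as an
+ * extra 256 iterations).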
+ */
+#define TO_SPIN_THRESHOLD      (1 << 13)
+#define TO_SLEEP_DECREMENT     (TO_SPIN_THRESHOLD/32)
+
+/*
+ * futex_lock_to() return values:
+ * 0 - the mutex owner acquires the lock
+ * 1 - the lock is stolen
+ * 2 - the lock is handed off
+ */
+#define TO_LOCK_ACQUIRED       0
+#define TO_LOCK_STOLEN         1
+#define TO_LOCK_HANDOFF                2
+
+/*
+ * Try to lock the userspace futex word (0 => vpid).
+ *
+ * Return: 1 if the lock is acquired or an error happens, 0 if not.
+ *        *status will be 0 if no error, or < 0 if an error happens.
+ *        *puval will contain the latest futex value when the trylock
+ *        fails.
+ *
+ * The HB spinlock should NOT be held while calling this function.
+ * The flag bits are ignored in the trylock.
+ *
+ * If waiter is true
+ * then
+ *   don't preserve the flag bits;
+ *   check for handoff (futex word == own pid)
+ * else
+ *   preserve the flag bits
+ * endif
+ */
+static inline int futex_trylock_to(u32 __user *uaddr, u32 vpid, u32 *puval,
+                                  int *status, const bool waiter)
+{
+       u32 uval, flags = 0;
+
+       *status = 0;
+
+       if (unlikely(get_user(uval, uaddr)))
+               goto efault;
+
+       *puval = uval;
+
+       if (!waiter) {
+               flags |= (uval & ~FUTEX_TID_MASK);
+       } else if ((uval & FUTEX_TID_MASK) == vpid) {
+               *status = TO_LOCK_HANDOFF;
+               return 1;
+       }
+
+       if (uval & FUTEX_TID_MASK)
+               return 0;       /* Trylock fails */
+
+       if (unlikely(futex_atomic_cmpxchg_inatomic(puval, uaddr, uval,
+                                                  vpid | flags)))
+               goto efault;
+
+       return *puval == uval;
+
+efault:
+       *status = -EFAULT;
+       return 1;
+}
+
+/*
+ * Set the FUTEX_WAITERS bit while not holding the HB lock. The given
+ * puval argument points to the initial value of the futex word.
+ *
+ * Return: 0 if successful, or < 0 if an error happens.
+ */
+static inline int futex_set_waiters_unlocked(u32 __user *uaddr, u32 *puval)
+{
+       u32 curval, uval = *puval;
+
+       while (!(uval & FUTEX_WAITERS)) {
+               /*
+                * Set the FUTEX_WAITERS bit.
+                */
+               if (futex_atomic_cmpxchg_inatomic(&curval,
+                       uaddr, uval, uval | FUTEX_WAITERS))
+                       return -1;
+               if (curval == uval)
+                       break;
+               uval = curval;
+       }
+       *puval = uval | FUTEX_WAITERS;
+       return 0;
+}
+
+/*
+ * Spin on the futex word while the futex owner is actively running.
+ * Otherwise, set the FUTEX_WAITERS bit and go to sleep. As we take a
+ * reference to the futex owner's task structure, we don't need to use
+ * RCU to ensure that the task structure stays valid. The function will
+ * directly grab the lock if the owner is dying or the PID is invalid.
+ * That should take care of the problem of dead lock owners, unless the
+ * PID wraps around and the perceived owner is not the real owner.
+ *
+ * Return: 0 if the futex is acquired, < 0 if an error happens.
+ */
+static int futex_spin_on_owner(u32 __user *uaddr, u32 vpid,
+                              struct futex_state *state)
+{
+       int ret, loop = TO_SPIN_THRESHOLD;
+       u32 uval, curval;
+       u32 opid = 0;                           /* Futex owner task ID */
+       struct task_struct *otask = NULL;       /* Futex owner task struct */
+       bool on_owner_list = false;
+
+       WRITE_ONCE(state->owner, current);
+       preempt_disable();
+       for (;; loop--) {
+               if (futex_trylock_to(uaddr, vpid, &uval, &ret, true))
+                       break;
+
+               if (((uval & FUTEX_TID_MASK) != opid) ||
+                    (uval & FUTEX_OWNER_DIED)) {
+                       /*
+                        * Get the new task structure
+                        */
+                       if (otask) {
+                               if (on_owner_list) {
+                                       raw_spin_lock_irq(&otask->pi_lock);
+                                       list_del_init(&state->list);
+                                       raw_spin_unlock_irq(&otask->pi_lock);
+                                       on_owner_list = false;
+                               }
+                               put_task_struct(otask);
+                       }
+
+                       if (likely(!(uval & FUTEX_OWNER_DIED))) {
+                               opid  = uval & FUTEX_TID_MASK;
+                               otask = futex_find_get_task(opid);
+                       } else {
+                               opid = 0;
+                               otask = NULL;
+                       }
+               }
+               if (unlikely(!otask || (otask->flags & PF_EXITING) ||
+                           (uval & FUTEX_OWNER_DIED))) {
+                       /*
+                        * PID invalid or exiting/dead task, try to grab
+                        * the lock now.
+                        */
+                       ret = futex_atomic_cmpxchg_inatomic(&curval,
+                                       uaddr, uval, vpid);
+                       if (unlikely(ret))
+                               goto efault;
+                       if (curval != uval)
+                               continue;       /* Futex value changed */
+                       pr_info("futex_spin_on_owner: pid %d grabs futex from pid %d (%s)!\n",
+                               vpid, opid, otask ? "dying" : "invalid");
+                       break;
+               }
+
+               if (need_resched()) {
+                       __set_current_state(TASK_RUNNING);
+                       schedule_preempt_disabled();
+                       continue;
+               }
+
+               /* Check for signal */
+               if (signal_pending(current)) {
+                       ret = -EINTR;
+                       break;
+               }
+
+               /*
+                * Enable lock hand-off if the threshold reaches 0.
+                * We also need to set the FUTEX_WAITERS bit to make sure
+                * that the futex lock holder will initiate the hand-off
+                * at unlock time.
+                */
+               if ((loop <= 0) && !READ_ONCE(state->handoff_pid)) {
+                       WRITE_ONCE(state->handoff_pid, vpid);
+               } else if (otask->on_cpu) {
+                       cpu_relax();
+                       continue;
+               }
+
+               /*
+                * If the owner isn't active, we need to go to sleep after
+                * making sure that the FUTEX_WAITERS bit is set. We also
+                * need to put the futex state into the futex owner's
+                * pi_state_list to prevent deadlock when the owner dies.
+                */
+               if (futex_set_waiters_unlocked(uaddr, &uval) < 0)
+                       goto efault;
+
+               if (otask->on_cpu) {
+                       cpu_relax();
+                       continue;
+               }
+
+               if (!on_owner_list) {
+                       raw_spin_lock_irq(&otask->pi_lock);
+                       if (unlikely(otask->flags & PF_EXITING)) {
+                               /*
+                                * Task is exiting, can directly
+                                * grab the futex instead.
+                                */
+                               raw_spin_unlock_irq(&otask->pi_lock);
+                               continue;
+                       }
+                       WARN_ON(!list_empty(&state->list));
+                       list_add(&state->list, &otask->pi_state_list);
+                       raw_spin_unlock_irq(&otask->pi_lock);
+                       on_owner_list = true;
+               }
+
+               /*
+                * Do a trylock after setting the task state to make
+                * sure we won't miss a wakeup.
+                *
+                * Futex owner          Mutex owner
+                * -----------          -----------
+                *  unlock              set state
+                *  MB                  MB
+                *  read state          trylock
+                *  wakeup              sleep
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (futex_trylock_to(uaddr, vpid, &uval, &ret, true)) {
+                       __set_current_state(TASK_RUNNING);
+                       break;
+               }
+
+               /*
+                * Don't sleep if the owner has died or the FUTEX_WAITERS
+                * bit was cleared. The latter can happen when an unlock
+                * and a lock steal occur between setting the
+                * FUTEX_WAITERS bit and setting the task state to
+                * TASK_INTERRUPTIBLE.
+                */
+               if (!(uval & FUTEX_OWNER_DIED) && (uval & FUTEX_WAITERS)) {
+                       schedule_preempt_disabled();
+                       loop -= TO_SLEEP_DECREMENT;
+               }
+               __set_current_state(TASK_RUNNING);
+       }
+out:
+       preempt_enable();
+       if (on_owner_list && !list_empty(&state->list)) {
+               BUG_ON(!otask);
+               raw_spin_lock_irq(&otask->pi_lock);
+               list_del_init(&state->list);
+               raw_spin_unlock_irq(&otask->pi_lock);
+       }
+       if (otask)
+               put_task_struct(otask);
+       /*
+        * Cleanup futex state.
+        */
+       WRITE_ONCE(state->owner, NULL);
+       WRITE_ONCE(state->handoff_pid, 0);
+       return ret;
+
+efault:
+       ret = -EFAULT;
+       goto out;
+}
+
+/*
+ * Userspace tried a 0 -> TID atomic transition of the futex value
+ * and failed. The kernel side here does the whole locking operation,
+ * using a kernel mutex for serialization. Once a task becomes the sole
+ * mutex owner, it spins on the futex owner's task structure to see if
+ * it is running. It also spins on the futex word so as to grab the
+ * lock as soon as it is free.
+ *
+ * This function is not inlined so that it shows up in stack traces for
+ * analysis purposes.
+ *
+ * Return:
+ *  < 0 - an error happens
+ *  0   - the lock is acquired via futex_spin_on_owner()
+ *  1   - the lock is stolen
+ *  2   - the lock is handed off by the unlocker
+ */
+static noinline int
+futex_lock_to(u32 __user *uaddr, unsigned int flags)
+{
+       struct futex_hash_bucket *hb;
+       union futex_key key = FUTEX_KEY_INIT;
+       struct futex_state *state;
+       u32 uval, vpid = task_pid_vnr(current);
+       int ret;
+
+       /*
+        * Try to steal the lock while preserving the flag bits.
+        */
+       if (futex_trylock_to(uaddr, vpid, &uval, &ret, false))
+               /* Lock acquired or an error happened */
+               return (ret < 0) ? ret : TO_LOCK_STOLEN;
+
+       /*
+        * Detect deadlocks.
+        */
+       if (unlikely(((uval & FUTEX_TID_MASK) == vpid) ||
+                       should_fail_futex(true)))
+               return -EDEADLK;
+
+       if (refill_futex_state_cache())
+               return -ENOMEM;
+
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
+       if (unlikely(ret))
+               goto out;
+
+       hb = hash_futex(&key);
+       spin_lock(&hb->lock);
+
+       /*
+        * Locate the futex state by looking up the futex state list in the
+        * hash bucket. If it isn't found, create a new one and put it into
+        * the list.
+        */
+       state = lookup_futex_state(hb, &key, false);
+
+       /*
+        * We don't need to hold the HB lock after looking up the futex state
+        * as we have incremented the reference count.
+        */
+       spin_unlock(&hb->lock);
+       BUG_ON(!state);
+
+       /*
+        * Acquire the serialization mutex.
+        */
+       if (state->type != TYPE_TO)
+               ret = -EINVAL;
+       else
+               ret = mutex_lock_interruptible(&state->mutex);
+
+       if (unlikely(ret))
+               /*
+                * We got a signal or some other error; we need to abort
+                * the lock operation and return.
+                */
+               goto out_put_state_key;
+
+       /*
+        * As the mutex owner, we can now spin on the futex word as well
+        * as on the running state of the futex owner.
+        */
+       ret = futex_spin_on_owner(uaddr, vpid, state);
+
+       mutex_unlock(&state->mutex);
+
+out_put_state_key:
+       if (!put_futex_state_unlocked(state)) {
+               /*
+                * The futex state object may need to be freed, so this
+                * must be done under the HB lock.
+                */
+               spin_lock(&hb->lock);
+               put_futex_state(state);
+               spin_unlock(&hb->lock);
+       }
+       put_futex_key(&key);
+
+out:
+       return ret;
+}
+
+/*
+ * Userspace attempted a TID -> 0 atomic transition of the futex value,
+ * and failed. This is the in-kernel slowpath: we look up the futex
+ * state (if any) and wake up the mutex owner.
+ *
+ * Return: 1 if a wakeup is attempted, 0 if there is no task to wake,
+ *        or < 0 when an error happens.
+ */
+static int futex_unlock_to(u32 __user *uaddr, unsigned int flags)
+{
+       u32 uval, newpid = 0, vpid = task_pid_vnr(current);
+       union futex_key key = FUTEX_KEY_INIT;
+       struct futex_hash_bucket *hb;
+       struct futex_state *state = NULL;
+       struct task_struct *owner = NULL;
+       int ret;
+       WAKE_Q(wake_q);
+
+       if (get_user(uval, uaddr))
+               return -EFAULT;
+
+       if ((uval & FUTEX_TID_MASK) != vpid)
+               return -EPERM;
+
+       if (!(uval & FUTEX_WAITERS))
+               return -EINVAL;
+
+       ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
+       if (ret)
+               return ret;
+
+       hb = hash_futex(&key);
+       spin_lock(&hb->lock);
+
+       /*
+        * Only search the hash bucket for a matching futex state.
+        */
+       state = lookup_futex_state(hb, &key, true);
+       WARN_ON_ONCE(!state);
+
+       if (state) {
+               if (state->type != TYPE_TO) {
+                       ret = -EINVAL;
+                       goto out_unlock;
+               }
+
+               newpid = READ_ONCE(state->handoff_pid);
+               if (newpid)
+                       WRITE_ONCE(state->handoff_pid, 0);
+
+               owner = READ_ONCE(state->owner);
+               if (owner)
+                       wake_q_add(&wake_q, owner);
+       }
+
+       /*
+        * Unlock the futex or hand it off to the next owner.
+        * The flag bits are not preserved, to encourage more lock
+        * stealing.
+        */
+       for (;;) {
+               u32 old = uval;
+
+               if (cmpxchg_futex_value_locked(&uval, uaddr, old, newpid)) {
+                       ret = -EFAULT;
+                       goto out_unlock;
+               }
+               if (old == uval)
+                       break;
+       }
+
+out_unlock:
+       spin_unlock(&hb->lock);
+       put_futex_key(&key);
+       if (owner) {
+               /*
+                * No error can have happened if owner is set.
+                */
+               wake_up_q(&wake_q);
+               return 1;
+       }
+
+       return ret;
+}
+#endif /* CONFIG_SMP */
+
 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                u32 __user *uaddr2, u32 val2, u32 val3)
 {
@@ -3193,6 +3794,10 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
        case FUTEX_TRYLOCK_PI:
        case FUTEX_WAIT_REQUEUE_PI:
        case FUTEX_CMP_REQUEUE_PI:
+#ifdef CONFIG_SMP
+       case FUTEX_LOCK_TO:
+       case FUTEX_UNLOCK_TO:
+#endif
                if (!futex_cmpxchg_enabled)
                        return -ENOSYS;
        }
@@ -3224,6 +3829,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                                             uaddr2);
        case FUTEX_CMP_REQUEUE_PI:
                return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
+#ifdef CONFIG_SMP
+       case FUTEX_LOCK_TO:
+               return futex_lock_to(uaddr, flags);
+       case FUTEX_UNLOCK_TO:
+               return futex_unlock_to(uaddr, flags);
+#endif
        }
        return -ENOSYS;
 }
@@ -3307,6 +3918,7 @@ static int __init futex_init(void)
        for (i = 0; i < futex_hashsize; i++) {
                atomic_set(&futex_queues[i].waiters, 0);
                plist_head_init(&futex_queues[i].chain);
+               INIT_LIST_HEAD(&futex_queues[i].fs_list);
                spin_lock_init(&futex_queues[i].lock);
        }
 
-- 
1.7.1
