[PATCH v3 02/13] futex: Rename futex_pi_state to futex_state
The futex_pi_state structure will be overloaded in later patches to store state information about non-PI futexes. So the structure name itself is no longer a good description of its purpose. So its name is changed to futex_state, a more generic name. Some of the functions that process the futex states are also renamed. Signed-off-by: Waiman Long--- include/linux/sched.h |4 +- kernel/futex.c| 107 + 2 files changed, 56 insertions(+), 55 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e5..fefd7f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -126,7 +126,7 @@ struct sched_attr { u64 sched_period; }; -struct futex_pi_state; +struct futex_state; struct robust_list_head; struct bio_list; struct fs_struct; @@ -1772,7 +1772,7 @@ struct task_struct { struct compat_robust_list_head __user *compat_robust_list; #endif struct list_head pi_state_list; - struct futex_pi_state *pi_state_cache; + struct futex_state *pi_state_cache; #endif #ifdef CONFIG_PERF_EVENTS struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts]; diff --git a/kernel/futex.c b/kernel/futex.c index 91724ec..f4e20ad 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -192,11 +192,12 @@ int __read_mostly futex_cmpxchg_enabled; #define FLAGS_HAS_TIMEOUT 0x04 /* - * Priority Inheritance state: + * Futex state object: + * - Priority Inheritance state */ -struct futex_pi_state { +struct futex_state { /* -* list of 'owned' pi_state instances - these have to be +* list of 'owned' state instances - these have to be * cleaned up in do_exit() if the task exits prematurely: */ struct list_head list; @@ -240,7 +241,7 @@ struct futex_q { struct task_struct *task; spinlock_t *lock_ptr; union futex_key key; - struct futex_pi_state *pi_state; + struct futex_state *pi_state; struct rt_mutex_waiter *rt_waiter; union futex_key *requeue_pi_key; u32 bitset; @@ -797,76 +798,76 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) /* * PI code: */ -static int refill_pi_state_cache(void) +static int refill_futex_state_cache(void) { - struct futex_pi_state *pi_state; + struct futex_state *state; if (likely(current->pi_state_cache)) return 0; - pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL); + state = kzalloc(sizeof(*state), GFP_KERNEL); - if (!pi_state) + if (!state) return -ENOMEM; - INIT_LIST_HEAD(_state->list); + INIT_LIST_HEAD(>list); /* pi_mutex gets initialized later */ - pi_state->owner = NULL; - atomic_set(_state->refcount, 1); - pi_state->key = FUTEX_KEY_INIT; + state->owner = NULL; + atomic_set(>refcount, 1); + state->key = FUTEX_KEY_INIT; - current->pi_state_cache = pi_state; + current->pi_state_cache = state; return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_state *alloc_futex_state(void) { - struct futex_pi_state *pi_state = current->pi_state_cache; + struct futex_state *state = current->pi_state_cache; - WARN_ON(!pi_state); + WARN_ON(!state); current->pi_state_cache = NULL; - return pi_state; + return state; } /* - * Drops a reference to the pi_state object and frees or caches it + * Drops a reference to the futex state object and frees or caches it * when the last reference is gone. * * Must be called with the hb lock held. */ -static void put_pi_state(struct futex_pi_state *pi_state) +static void put_futex_state(struct futex_state *state) { - if (!pi_state) + if (!state) return; - if (!atomic_dec_and_test(_state->refcount)) + if (!atomic_dec_and_test(>refcount)) return; /* -* If pi_state->owner is NULL, the owner is most probably dying -* and has cleaned up the pi_state already +* If state->owner is NULL, the owner is most probably dying +* and has cleaned up the futex state already */ - if (pi_state->owner) { - raw_spin_lock_irq(_state->owner->pi_lock); - list_del_init(_state->list); - raw_spin_unlock_irq(_state->owner->pi_lock); + if (state->owner) { + raw_spin_lock_irq(>owner->pi_lock); + list_del_init(>list); + raw_spin_unlock_irq(>owner->pi_lock); - rt_mutex_proxy_unlock(_state->pi_mutex, pi_state->owner); + rt_mutex_proxy_unlock(>pi_mutex, state->owner); } if (current->pi_state_cache) - kfree(pi_state); + kfree(state); else { /* -*
[PATCH v3 05/13] futex: Add a new futex type field into futex_state
As the futex_state structure will be overloaded in later patches to be used by non-PI futexes, it is necessary to add a type field to distinguish among different types of futexes. Signed-off-by: Waiman Long--- kernel/futex.c | 14 ++ 1 files changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 0d1b1be..a496c01 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -191,6 +191,10 @@ int __read_mostly futex_cmpxchg_enabled; #define FLAGS_CLOCKRT 0x02 #define FLAGS_HAS_TIMEOUT 0x04 +enum futex_type { + TYPE_PI = 0, +}; + /* * Futex state object: * - Priority Inheritance state @@ -210,6 +214,7 @@ struct futex_state { struct task_struct *owner; atomic_t refcount; + enum futex_type type; union futex_key key; }; @@ -897,10 +902,10 @@ static void put_futex_state(struct futex_state *state) return; /* -* If state->owner is NULL, the owner is most probably dying -* and has cleaned up the futex state already +* If state->owner is NULL and the type is TYPE_PI, the owner is +* most probably dying and has cleaned up the futex state already */ - if (state->owner) { + if (state->owner && (state->type == TYPE_PI)) { task_pi_list_del(state->owner, state, false); rt_mutex_proxy_unlock(>pi_mutex, state->owner); @@ -1056,7 +1061,7 @@ static int attach_to_pi_state(u32 uval, struct futex_state *pi_state, /* * Userspace might have messed up non-PI and PI futexes [3] */ - if (unlikely(!pi_state)) + if (unlikely(!pi_state || (pi_state->type != TYPE_PI))) return -EINVAL; WARN_ON(!atomic_read(_state->refcount)); @@ -1174,6 +1179,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* Store the key for possible exit cleanups: */ pi_state->key = *key; + pi_state->type = TYPE_PI; WARN_ON(!list_empty(_state->list)); list_add(_state->list, >pi_state_list); -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 09/13] futex: Implement lock handoff for TP futexes to prevent lock starvation
The current TP futexes has no guarantee that the top waiter (serialization mutex owner) can get the lock within a finite time. As a result, lock starvation can happen. A lock handoff mechanism is added to the TP futexes to prevent lock starvation from happening. The idea is that the top waiter can set a special handoff_pid variable with its own pid. The futex owner will then check this variable at unlock time to see if it should free the futex or change its value to the designated pid to transfer the lock directly to the top waiter. This change does have the effect of limiting the amount of unfairness and hence may adversely affect performance depending on the workloads. Using a futex locking microbenchmark on a 4-socket 72-core 144-thread Haswell-EX system running 4.8-rc7 based kernel, the performance data for the TP futexes before and after the patch were as follows: CS length Before patchAfter patch% change - --- 5 54,622,973 59,455,380 +8.8% 10 42,577,283 41,161,387 -3.3% 20 29,424,333 37,718,989 +28.2% 30 33,664,095 29,474,986 -12.4% 40 27,998,446 24,774,389 -11.5% 50 25,793,991 22,718,432 -11.9% 1us sleep 179,067 177,662 -0.8% In fact, when the critical section is a 1us sleep, the wait-wake futexes perform a bit better than the TP futexes as the former futexes are more unfair in this case. So lock starvation can happen with wait-wake futexes. WW futexTP futex Total locking ops 175,824 177,662 Per-thread avg/sec 122 123 Per-thread min/sec2 106 Per-thread max/sec 498 199 % Stddev 6.26% 0.97% Signed-off-by: Waiman Long--- kernel/futex.c | 55 --- 1 files changed, 48 insertions(+), 7 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 5fa9a10..d260410 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -228,6 +228,8 @@ struct futex_state { struct task_struct *owner; atomic_t refcount; + u32 handoff_pid;/* For TP futexes only */ + enum futex_type type; union futex_key key; }; @@ -898,6 +900,7 @@ static int refill_futex_state_cache(void) /* pi_mutex gets initialized later */ state->owner = NULL; + state->handoff_pid = 0; atomic_set(>refcount, 1); state->key = FUTEX_KEY_INIT; @@ -3303,8 +3306,9 @@ void exit_robust_list(struct task_struct *curr) * * Within the kernel, trylocks are done ignoring the FUTEX_WAITERS bit. The * purpose of this FUTEX_WAITERS bit is to make the unlocker wake up the - * serialization mutex owner. Not having the FUTEX_WAITERS bit set doesn't - * mean there is no waiter in the kernel. + * serialization mutex owner or the hand off the lock directly to the top + * waiter. Not having the FUTEX_WAITERS bit set doesn't mean there is no + * waiter in the kernel. * * Like PI futexes, TP futexes are orthogonal to robust futexes can be * used together. @@ -3320,6 +3324,19 @@ void exit_robust_list(struct task_struct *curr) * the ownership of the futex. */ +/* + * Spinning and sleeping thresholds for enabling lock handoff and prevent + * lock starvation. + * + * The current threshold values setting is kind of arbitrary. Setting them + * too low will impact the amount of lock stealing possible thus reducing + * performance. Setting them too high may make the top waiter wait too long + * before getting the lock. The current set of values are a bit on the + * high side to ensure adequate performance. + */ +#define TP_SPIN_THRESHOLD (1 << 14) +#define TP_SLEEP_DECREMENT (TP_SPIN_THRESHOLD/64) + /** * lookup_futex_state - Looking up the futex state structure. * @hb: hash bucket @@ -3398,6 +3415,7 @@ static inline int put_futex_state_unlocked(struct futex_state *state) * If waiter is true * then * don't preserve the flag bits; + * check for handoff (futex word == own pid) * else * preserve the flag bits * endif @@ -3416,6 +3434,9 @@ static inline int futex_trylock(u32 __user *uaddr, u32 vpid, u32 *puval, *puval = uval; + if (waiter && (uval & FUTEX_TID_MASK) == vpid) + return 1; + if (uval & FUTEX_TID_MASK) return 0; /* Trylock fails */ @@ -3478,7 +3499,7 @@ static inline int futex_set_waiters_bit(u32 __user *uaddr, u32 *puval) static int futex_spin_on_owner(u32 __user *uaddr, u32 vpid, struct futex_state *state) { - int ret; + int ret, loop = TP_SPIN_THRESHOLD; u32 uval; u32 owner_pid = 0; struct
[PATCH v3 08/13] futex: Enable robust handling of TP futexes
The TP futexes don't have code to handle the death of futex owners. There are 2 different cases that need to be considered. As top waiter gets a reference to the task structure of the futex owner, the task structure will never go away even if the owner dies. When the futex owner died while the top waiter is spinning, the task structure will be marked dead or the pid won't have a matching task structure if the task died before a reference is taken. Alternatively, if robust futex attribute is enabled, the FUTEX_OWNER_DIED bit of the futex word may be set. In all those cases, what the top waiter need to do is to grab the futex directly. An informational message will also printed to highlight this event. If the futex owner died while the top waiter is sleeping, we need to make the exit processing code to wake up the top waiter. This is done by chaining the futex state object into the pi_state_list of the futex owner before the top waiter sleeps so that if exit_pi_state_list() is called, the wakeup will happen. The top waiter needs to remove its futex state object from the pi_state_list of the old owner if the ownership changes hand. Signed-off-by: Waiman Long--- kernel/futex.c | 82 +++ 1 files changed, 76 insertions(+), 6 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index bfe17a9..5fa9a10 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -973,7 +973,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) } /* - * This task is holding PI mutexes at exit time => bad. + * This task is holding PI or TP mutexes at exit time => bad. * Kernel cleans up PI-state, but userspace is likely hosed. * (Robust-futex cleanup is separate and might save the day for userspace.) */ @@ -990,12 +990,29 @@ void exit_pi_state_list(struct task_struct *curr) * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful * versus waiters unqueueing themselves: +* +* For TP futexes, the only purpose of showing up in the +* pi_state_list is for this function to wake up the serialization +* mutex owner (state->owner). We don't actually need to take the +* HB lock. The futex state and task struct won't go away as long +* as we hold the pi_lock. */ raw_spin_lock_irq(>pi_lock); while (!list_empty(head)) { next = head->next; pi_state = list_entry(next, struct futex_state, list); + + if (pi_state->type == TYPE_TP) { + struct task_struct *owner = READ_ONCE(pi_state->owner); + + WARN_ON(list_empty(_state->list)); + list_del_init(_state->list); + if (owner) + wake_up_process(owner); + continue; + } + key = pi_state->key; hb = hash_futex(); raw_spin_unlock_irq(>pi_lock); @@ -3152,8 +3169,8 @@ retry: goto retry; /* -* Wake robust non-PI futexes here. The wakeup of -* PI futexes happens in exit_pi_state(): +* Wake robust wait-wake futexes here. The wakeup of +* PI and TP futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); @@ -3295,6 +3312,12 @@ void exit_robust_list(struct task_struct *curr) * Unlike the other futexes, the futex_q structures aren't used. Instead, * they will queue up in the serialization mutex of the futex state container * queued in the hash bucket. + * + * To handle the exceptional case that the futex owner died, the robust + * futexes list mechanism is used to for waking up sleeping top waiter. + * Checks are also made in the futex_spin_on_owner() loop for dead task + * structure or invalid pid. In both cases, the top waiter will take over + * the ownership of the futex. */ /** @@ -3459,6 +3482,7 @@ static int futex_spin_on_owner(u32 __user *uaddr, u32 vpid, u32 uval; u32 owner_pid = 0; struct task_struct *owner_task = NULL; + bool on_owner_pi_list = false; WRITE_ONCE(state->owner, current); preempt_disable(); @@ -3468,13 +3492,47 @@ static int futex_spin_on_owner(u32 __user *uaddr, u32 vpid, break; if ((uval & FUTEX_TID_MASK) != owner_pid) { - if (owner_task) + if (owner_task) { + /* +* task_pi_list_del() should always be +* done before put_task_struct(). The futex +* state may have been dequeued if the task +
[PATCH v3 04/13] futex: Consolidate pure pi_state_list add & delete codes to helpers
Two new helper functions (task_pi_list_add & task_pi_list_del) are created to consolidate all the pure pi_state_list addition and insertion codes. The set_owner argument in task_pi_list_add() will be needed in a later patch. Signed-off-by: Waiman Long--- kernel/futex.c | 67 +++- 1 files changed, 42 insertions(+), 25 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index ebe59fa..0d1b1be 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -813,6 +813,42 @@ static inline int get_futex_value(u32 *dest, u32 __user *from) /* * PI code: */ + +/** + * task_pi_list_add - add futex state object to a task's pi_state_list + * @task : task structure + * @state: futex state object + * @set_owner: set futex state owner to the given task if true + */ +static inline void task_pi_list_add(struct task_struct *task, + struct futex_state *state, + const bool set_owner) +{ + raw_spin_lock_irq(>pi_lock); + WARN_ON(!list_empty(>list)); + list_add(>list, >pi_state_list); + if (set_owner) + state->owner = task; + raw_spin_unlock_irq(>pi_lock); +} + +/** + * task_pi_list_del - delete futex state object from a task's pi_state_list + * @task : task structure + * @state: futex state object + * @warn : warn if list is empty when set + */ +static inline void task_pi_list_del(struct task_struct *task, + struct futex_state *state, + const bool warn) +{ + raw_spin_lock_irq(>pi_lock); + if (warn) + WARN_ON(list_empty(>list)); + list_del_init(>list); + raw_spin_unlock_irq(>pi_lock); +} + static int refill_futex_state_cache(void) { struct futex_state *state; @@ -865,9 +901,7 @@ static void put_futex_state(struct futex_state *state) * and has cleaned up the futex state already */ if (state->owner) { - raw_spin_lock_irq(>owner->pi_lock); - list_del_init(>list); - raw_spin_unlock_irq(>owner->pi_lock); + task_pi_list_del(state->owner, state, false); rt_mutex_proxy_unlock(>pi_mutex, state->owner); } @@ -1388,16 +1422,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, return ret; } - raw_spin_lock(_state->owner->pi_lock); - WARN_ON(list_empty(_state->list)); - list_del_init(_state->list); - raw_spin_unlock(_state->owner->pi_lock); - - raw_spin_lock(_owner->pi_lock); - WARN_ON(!list_empty(_state->list)); - list_add(_state->list, _owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(_owner->pi_lock); + task_pi_list_del(pi_state->owner, pi_state, true); + task_pi_list_add(new_owner, pi_state, true); raw_spin_unlock_irq(_state->pi_mutex.wait_lock); @@ -2202,19 +2228,10 @@ retry: * We fixed up user space. Now we need to fix the pi_state * itself. */ - if (pi_state->owner != NULL) { - raw_spin_lock_irq(_state->owner->pi_lock); - WARN_ON(list_empty(_state->list)); - list_del_init(_state->list); - raw_spin_unlock_irq(_state->owner->pi_lock); - } + if (pi_state->owner != NULL) + task_pi_list_del(pi_state->owner, pi_state, true); - pi_state->owner = newowner; - - raw_spin_lock_irq(>pi_lock); - WARN_ON(!list_empty(_state->list)); - list_add(_state->list, >pi_state_list); - raw_spin_unlock_irq(>pi_lock); + task_pi_list_add(newowner, pi_state, true); return 0; /* -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 03/13] futex: Add helpers to get & cmpxchg futex value without lock
Two new helper functions cmpxchg_futex_value() and get_futex_value() are added to access and change the futex value without the hash bucket lock. As a result, page fault is enabled and the page will be faulted in if not present yet. Signed-off-by: Waiman Long--- kernel/futex.c | 15 +++ 1 files changed, 15 insertions(+), 0 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index f4e20ad..ebe59fa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -794,6 +794,21 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) return ret ? -EFAULT : 0; } +/* + * The equivalents of the above cmpxchg_futex_value_locked() and + * get_futex_value_locked which are called without the hash bucket lock + * and so can have page fault enabled. + */ +static inline int cmpxchg_futex_value(u32 *curval, u32 __user *uaddr, + u32 uval, u32 newval) +{ + return futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); +} + +static inline int get_futex_value(u32 *dest, u32 __user *from) +{ + return __get_user(*dest, from); +} /* * PI code: -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 10/13] futex: Inform FUTEX_LOCK callers on how the lock is acquired
As there are three different ways for a TP futex waiter in the kernel to acquire the lock. It may be useful to pass this information out so that user space has a better view of what is happening in the kernel. With this change, different non-negative values will now be returned depending on how the lock is acquired in the kernel. Signed-off-by: Waiman Long--- kernel/futex.c | 30 ++ 1 files changed, 22 insertions(+), 8 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index d260410..1219f32 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3337,7 +3337,17 @@ void exit_robust_list(struct task_struct *curr) #define TP_SPIN_THRESHOLD (1 << 14) #define TP_SLEEP_DECREMENT (TP_SPIN_THRESHOLD/64) -/** +/* + * futex_lock() returned values to identify how the lock is acquired: + * 0 - steals the lock + * 1 - top waiter (mutex owner) acquires the lock + * 2 - handed off the lock + */ +#define TP_LOCK_STOLEN 0 +#define TP_LOCK_ACQUIRED 1 +#define TP_LOCK_HANDOFF2 + + /** * lookup_futex_state - Looking up the futex state structure. * @hb: hash bucket * @key:futex key @@ -3420,9 +3430,11 @@ static inline int put_futex_state_unlocked(struct futex_state *state) * preserve the flag bits * endif * - * Return: 1 if lock acquired; + * Return: TP_LOCK_ACQUIRED if lock acquired; + *TP_LOCK_HANDOFF if lock was handed off; *0 if lock acquisition failed; *-EFAULT if an error happened. + **puval will contain the latest futex value when trylock fails. */ static inline int futex_trylock(u32 __user *uaddr, u32 vpid, u32 *puval, const bool waiter) @@ -3435,7 +3447,7 @@ static inline int futex_trylock(u32 __user *uaddr, u32 vpid, u32 *puval, *puval = uval; if (waiter && (uval & FUTEX_TID_MASK) == vpid) - return 1; + return TP_LOCK_HANDOFF; if (uval & FUTEX_TID_MASK) return 0; /* Trylock fails */ @@ -3446,7 +3458,7 @@ static inline int futex_trylock(u32 __user *uaddr, u32 vpid, u32 *puval, if (unlikely(cmpxchg_futex_value(puval, uaddr, uval, vpid|flags))) return -EFAULT; - return *puval == uval; + return (*puval == uval) ? TP_LOCK_ACQUIRED : 0; } /** @@ -3659,8 +3671,10 @@ efault: * This function is not inlined so that it can show up separately in perf * profile for performance analysis purpose. * - * Return: 0 - lock acquired - *< 0 - an error happens + * Return: TP_LOCK_ACQUIRED - lock acquired normally + *TP_LOCK_HANDOFF - lock handed off directly from unlocker + *TP_LOCK_STOLEN - lock stolen + *< 0 - an error happens */ static noinline int futex_lock(u32 __user *uaddr, unsigned int flags) { @@ -3676,7 +3690,7 @@ static noinline int futex_lock(u32 __user *uaddr, unsigned int flags) */ ret = futex_trylock(uaddr, vpid, , false); if (ret) - goto out; + return (ret < 0) ? ret : TP_LOCK_STOLEN; /* * Detect deadlocks. @@ -3750,7 +3764,7 @@ out_put_state_key: put_futex_key(); out: - return (ret < 0) ? ret : 0; + return ret; } /* -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 00/13] futex: Introducing throughput-optimized futexes
v2->v3: - Use the abbreviation TP for the new futexes instead of TO. - Make a number of changes accordingly to review comments from ThomasG, PeterZ and MikeG. - Breaks the main futex patch into smaller pieces to make them easier to review. - Integrate the microbenchmark into the "perf bench futex" tool so that others can use it too. v1->v2: - Adds an explicit lock hand-off mechanism. - Adds timeout support. - Simplifies the required userspace code. - Fixes a number of problems in the v1 code. This patchset introduces a new futex implementation called throughput-optimized (TP) futexes. It is similar to PI futexes in its calling convention, but provides better throughput than the wait-wake futexes by encouraging more lock stealing and optimistic spinning especially when the lock holders don't sleep and the critical sections are not very short. A manpage patch that documents changes to the futex(2) system call will be sent out separately. Patch 1 consolidates the duplicated timer setup code. Patch 2 renames the futex_pi_state structure to futex_state as it will no longer be specific to the PI futexes. Some futex_pi_state related functions are also renamed. Patch 3 adds some helper functions to be used in later patches. Patch 4 consolidates the codes that need to add or delete futex_state to pi_state_list. Patch 5 adds a new type field to the futex_state structure. Patch 6 changes the futex_hash_bucket to include another list for futex state objects and a spinlock to guard the new list. Patch 7 is the core of this patchset and introduction of the new TP futexes. Patch 8 enables more robust handling when the futex owners died unexpectedly. Patch 9 implements the lock handoff mechanism to prevent lock starvation. Patch 10 enables the FUTEX_LOCK call to return a status code to show how the lock is being acquired. Patch 11 adds code support the specification of a timeout value in the FUTEX_LOCK call. Patch 12 adds a new documentation file about the TP futexes. Patch 13 adds the futex microbenchmark that was used to produce the performance data to the "perf bench futex" tool. Others can then use it to do their own evaluation. Waiman Long (13): futex: Consolidate duplicated timer setup code futex: Rename futex_pi_state to futex_state futex: Add helpers to get & cmpxchg futex value without lock futex: Consolidate pure pi_state_list add & delete codes to helpers futex: Add a new futex type field into futex_state futex: Allow direct attachment of futex_state objects to hash bucket futex: Throughput-optimized (TP) futexes futex: Enable robust handling of TP futexes futex: Implement lock handoff for TP futexes to prevent lock starvation futex: Inform FUTEX_LOCK callers on how the lock is acquired futex: Add timeout support to TP futexes futex, doc: TP futexes document perf bench: New microbenchmark for userspace mutex performance Documentation/00-INDEX |2 + Documentation/tp-futex.txt | 147 +++ include/linux/sched.h |4 +- include/uapi/linux/futex.h |4 + kernel/futex.c | 943 +++- tools/perf/bench/Build |1 + tools/perf/bench/bench.h |1 + tools/perf/bench/futex-mutex.c | 558 tools/perf/bench/futex.h | 25 + tools/perf/builtin-bench.c |4 + 10 files changed, 1575 insertions(+), 114 deletions(-) create mode 100644 Documentation/tp-futex.txt create mode 100644 tools/perf/bench/futex-mutex.c -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 13/13] perf bench: New microbenchmark for userspace mutex performance
This microbenchmark simulates how the use of different futex types can affect the actual performanace of userspace mutex locks. The usage is: perf bench futex mutex Three sets of simple mutex lock and unlock functions are implemented using the wait-wake, PI and TP futexes respectively. This microbenchmark then runs the locking rate measurement tests using either one of those mutexes or all of them consecutively. Signed-off-by: Waiman Long--- tools/perf/bench/Build |1 + tools/perf/bench/bench.h |1 + tools/perf/bench/futex-mutex.c | 558 tools/perf/bench/futex.h | 25 ++ tools/perf/builtin-bench.c |4 + 5 files changed, 589 insertions(+), 0 deletions(-) create mode 100644 tools/perf/bench/futex-mutex.c diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build index 60bf119..dc6dfc1 100644 --- a/tools/perf/bench/Build +++ b/tools/perf/bench/Build @@ -6,6 +6,7 @@ perf-y += futex-wake.o perf-y += futex-wake-parallel.o perf-y += futex-requeue.o perf-y += futex-lock-pi.o +perf-y += futex-mutex.o perf-$(CONFIG_X86_64) += mem-memcpy-x86-64-asm.o perf-$(CONFIG_X86_64) += mem-memset-x86-64-asm.o diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 579a592..b0632df 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -36,6 +36,7 @@ int bench_futex_wake_parallel(int argc, const char **argv, const char *prefix); int bench_futex_requeue(int argc, const char **argv, const char *prefix); /* pi futexes */ int bench_futex_lock_pi(int argc, const char **argv, const char *prefix); +int bench_futex_mutex(int argc, const char **argv, const char *prefix); #define BENCH_FORMAT_DEFAULT_STR "default" #define BENCH_FORMAT_DEFAULT 0 diff --git a/tools/perf/bench/futex-mutex.c b/tools/perf/bench/futex-mutex.c new file mode 100644 index 000..d48e198 --- /dev/null +++ b/tools/perf/bench/futex-mutex.c @@ -0,0 +1,558 @@ +/* + * Copyright (C) 2016 Waiman Long + * + * This microbenchmark simulates how the use of different futex types can + * affect the actual performanace of userspace locking primitives like mutex. + * + * The raw throughput of the futex lock and unlock calls is not a good + * indication of actual throughput of the mutex code as it may not really + * need to call into the kernel. Therefore, 3 simple mutex lock and unlock + * functions are written to implenment a mutex lock using the wait-wake, + * PI and TP futexes respectively. These functions serve as the basis for + * measuring the locking throughput. + */ + +#include + +#include +#include +#include "../util/stat.h" +#include "../perf-sys.h" +#include +#include +#include +#include +#include "bench.h" +#include "futex.h" + +#include +#include +#include + +#definegettid()syscall(SYS_gettid) +#define __cacheline_aligned__attribute__((__aligned__(64))) + +typedef u32 futex_t; +typedef void (*mutex_lock_fn_t)(futex_t *futex, int tid); +typedef void (*mutex_unlock_fn_t)(futex_t *futex, int tid); + + +struct worker { + futex_t *futex; + pthread_t thread; + + /* +* Per-thread operation statistics +*/ + unsigned int ops; /* # of locking operations */ + unsigned int locks; /* # of lock futex calls*/ + unsigned int unlocks; /* # of unlock futex calls */ + unsigned int eagains; /* # of EAGAIN errors */ + unsigned int lockerrs; /* # of other lock errors */ + unsigned int unlockerrs;/* # of unlock errors */ + unsigned int wakeups; /* # of wakeups (unlock return) */ + unsigned int handoffs; /* # of lock handoff (TP only) */ + unsigned int steals;/* # of lock steals (TP only) */ +} __cacheline_aligned; + +/* + * Global cache-aligned futex + */ +static futex_t global_futex __cacheline_aligned; + +static __thread futex_t thread_id; /* Thread ID */ +static __thread int counter; /* Sleep counter */ + +static struct worker *worker; +static unsigned int nsecs = 10; +static bool verbose, done, fshared, exit_now; +static unsigned int ncpus, nthreads; +static int futex_flag; +static const char *ftype = "all"; +static int csload = 1; +static int wratio; +struct timeval start, end, runtime; +static unsigned int worker_start; +static unsigned int threads_starting; +static struct stats throughput_stats; +static mutex_lock_fn_t mutex_lock_fn; +static mutex_unlock_fn_t mutex_unlock_fn; + +/* + * CS - Lock critical section + */ +static const struct option options[] = { + OPT_STRING ('f', "futex-type", , "type", "Specify futex type: WW, PI, TP, all (default)"), + OPT_INTEGER ('l', "load", , "Specify # of cpu_relax's inside CS, default = 1"), + OPT_UINTEGER('t', "threads",, "Specify number of threads, default = # of CPUs"), + OPT_UINTEGER('r', "runtime",,
[PATCH v3 06/13] futex: Allow direct attachment of futex_state objects to hash bucket
Currently, the futex state objects can only be located indirectly as hash bucket => futex_q => futex state Actually it can be beneficial in some cases to locate the futex state object directly from the hash bucket without the futex_q middleman. Therefore, a new list head to link the futex state objects as well as a new spinlock to manage them are added to the hash bucket. To limit size increase for UP systems, these new fields are only for SMP machines where the cacheline alignment of the hash bucket leaves it with enough empty space for the new fields. Signed-off-by: Waiman Long--- kernel/futex.c | 24 1 files changed, 24 insertions(+), 0 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index a496c01..646445f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -207,6 +207,11 @@ struct futex_state { struct list_head list; /* +* Can link to fs_head in the owning hash bucket. +*/ + struct list_head fs_list; + + /* * The PI object: */ struct rt_mutex pi_mutex; @@ -262,11 +267,24 @@ static const struct futex_q futex_q_init = { * Hash buckets are shared by all the futex_keys that hash to the same * location. Each key may have multiple futex_q structures, one for each task * waiting on a futex. + * + * Alternatively (in SMP), a key can be associated with a unique futex_state + * object where multiple waiters waiting for that futex can queue up in that + * futex_state object without using the futex_q structure. A separate + * futex_state lock (fs_lock) is used for processing those futex_state objects. */ struct futex_hash_bucket { atomic_t waiters; spinlock_t lock; struct plist_head chain; + +#ifdef CONFIG_SMP + /* +* Fields for managing futex_state object list +*/ + spinlock_t fs_lock; + struct list_head fs_head; +#endif } cacheline_aligned_in_smp; /* @@ -867,6 +885,8 @@ static int refill_futex_state_cache(void) return -ENOMEM; INIT_LIST_HEAD(>list); + INIT_LIST_HEAD(>fs_list); + /* pi_mutex gets initialized later */ state->owner = NULL; atomic_set(>refcount, 1); @@ -3356,6 +3376,10 @@ static int __init futex_init(void) atomic_set(_queues[i].waiters, 0); plist_head_init(_queues[i].chain); spin_lock_init(_queues[i].lock); +#ifdef CONFIG_SMP + INIT_LIST_HEAD(_queues[i].fs_head); + spin_lock_init(_queues[i].fs_lock); +#endif } return 0; -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH v3 01/13] futex: Consolidate duplicated timer setup code
A new futex_setup_timer() helper function is added to consolidate all the hrtimer_sleeper setup code. Signed-off-by: Waiman Long--- kernel/futex.c | 67 +++- 1 files changed, 37 insertions(+), 30 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 46cb3a3..91724ec 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -468,6 +468,35 @@ static void drop_futex_key_refs(union futex_key *key) } /** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + *value given + */ +static inline struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_on_stack(>timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(timeout, current); + if (range_ns) + hrtimer_set_expires_range_ns(>timer, *time, range_ns); + else + hrtimer_set_expires(>timer, *time); + + return timeout; +} + +/** * get_futex_key() - Get parameters which are the keys for a futex * @uaddr: virtual address of the futex * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED @@ -2393,7 +2422,7 @@ out: static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; @@ -2403,17 +2432,8 @@ static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, return -EINVAL; q.bitset = bitset; - if (abs_time) { - to = - - hrtimer_init_on_stack(>timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(>timer, *abs_time, -current->timer_slack_ns); - } - + to = futex_setup_timer(abs_time, , flags, + current->timer_slack_ns); retry: /* * Prepare to wait on uaddr. On success, holds hb lock and increments @@ -2493,7 +2513,7 @@ static long futex_wait_restart(struct restart_block *restart) static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2501,13 +2521,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, if (refill_pi_state_cache()) return -ENOMEM; - if (time) { - to = - hrtimer_init_on_stack(>timer, CLOCK_REALTIME, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires(>timer, *time); - } + to = futex_setup_timer(time, , FLAGS_CLOCKRT, 0); retry: ret = get_futex_key(uaddr, flags & FLAGS_SHARED, , VERIFY_WRITE); @@ -2802,7 +2816,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 val, ktime_t *abs_time, u32 bitset, u32 __user *uaddr2) { - struct hrtimer_sleeper timeout, *to = NULL; + struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; struct futex_hash_bucket *hb; @@ -2816,15 +2830,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (!bitset) return -EINVAL; - if (abs_time) { - to = - hrtimer_init_on_stack(>timer, (flags & FLAGS_CLOCKRT) ? - CLOCK_REALTIME : CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(to, current); - hrtimer_set_expires_range_ns(>timer, *abs_time, -current->timer_slack_ns); - } + to = futex_setup_timer(abs_time, , flags, + current->timer_slack_ns); /* * The waiter is allocated on our stack, manipulated by
Re: [PATCH 0/2] Moving runnable code from Documentation (last 2 patches)
On 09/26/2016 12:40 PM, Shuah Khan wrote: > This patch series contains the last 2 patches to complete moving runnable > code from Documentation to selftests, samples, and tools. > > The first patch moves blackfin gptimers-example to samples and removes > CONFIG_BUILD_DOCSRC. > > The second one updates 00-INDEX files under Documentation to reflect the > move of runnable code from Documentation. > > Shuah Khan (2): > samples: move blackfin gptimers-example from Documentation > Doc: update 00-INDEX files to reflect the runnable code move > Hi Jon and Michal, Do these patches look good to you? I can get these into 4.9-rc1 with your Ack. thanks, -- Shuah -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] doc: convert 80211 DocBook to RST
Off-commit comment: this is a line-by-line, title-by-title, section-by-section, comment-by-comment conversion of the DocBook 80211.tmpl file to RST. The 80211.tmpl file has also been deleted, and removed from the DocBook/Makefile list -- I'm not sure this should be done. I'm sure that reorganization is something planned too, but a first conversion away from DocBook is a good idea anyway, and I'd need to know much more about the topic at hand to do the reorganization. Regards, -- Florian Margaine pgp66UbS6Tztv.pgp Description: OpenPGP digital signature
[PATCH] doc: convert 80211 DocBook to RST
Signed-off-by: Florian Margaine--- .../actions-and-configuration.rst | 9 + .../80211/cfg80211-subsystem/data-path-helpers.rst | 9 + .../cfg80211-subsystem/device-registration.rst | 9 + Documentation/80211/cfg80211-subsystem/index.rst | 14 + .../regulatory-enforcement-infrastructure.rst | 9 + .../cfg80211-subsystem/rfkill-integration.rst | 9 + .../scanning-and-bss-list-handling.rst | 9 + .../80211/cfg80211-subsystem/test-mode.rst | 9 + .../80211/cfg80211-subsystem/utility-functions.rst | 9 + ...reless-80211-networking-in-the-linux-kernel.rst | 47 ++ Documentation/80211/index.rst | 9 + .../advanced/access-point-mode-support.rst | 19 + .../advanced/aggregation.rst | 18 + .../advanced/beacon-filter-support.rst | 9 + .../advanced/hardware-crypto-acceleration.rst | 9 + .../advanced/hardware-scan-offload.rst | 8 + .../mac80211-developers-guide/advanced/index.rst | 21 + .../advanced/led-support.rst | 11 + .../advanced/multiple-queues-and-qos-support.rst | 8 + .../advanced/powersave-support.rst | 6 + .../advanced/spatial-multiplexing-powersave.rst| 9 + .../advanced/station-handling.rst | 8 + .../supporting-multiple-virtual-interfaces.rst | 14 + .../basic/basic-hardware-handling.rst | 22 + .../basic/frame-filtering.rst | 9 + .../mac80211-developers-guide/basic/index.rst | 20 + .../basic/phy-configuration.rst| 11 + .../basic/receive-and-transmit-processing.rst | 35 ++ .../basic/the-mac80211-workqueue.rst | 9 + .../basic/virtual-interfaces.rst | 20 + .../80211/mac80211-developers-guide/index.rst | 22 + .../internals/aggregation.rst | 6 + .../mac80211-developers-guide/internals/index.rst | 16 + .../internals/key-handling.rst | 14 + .../internals/receive-processing.rst | 5 + .../internals/station-info-handling.rst| 15 + .../internals/synchronisation.rst | 7 + .../internals/transmit-processing.rst | 3 + .../rate-control/index.rst | 12 + .../rate-control/rate-control-api.rst | 8 + Documentation/DocBook/80211.tmpl | 584 - Documentation/DocBook/Makefile | 2 +- Documentation/index.rst| 1 + 43 files changed, 518 insertions(+), 585 deletions(-) create mode 100644 Documentation/80211/cfg80211-subsystem/actions-and-configuration.rst create mode 100644 Documentation/80211/cfg80211-subsystem/data-path-helpers.rst create mode 100644 Documentation/80211/cfg80211-subsystem/device-registration.rst create mode 100644 Documentation/80211/cfg80211-subsystem/index.rst create mode 100644 Documentation/80211/cfg80211-subsystem/regulatory-enforcement-infrastructure.rst create mode 100644 Documentation/80211/cfg80211-subsystem/rfkill-integration.rst create mode 100644 Documentation/80211/cfg80211-subsystem/scanning-and-bss-list-handling.rst create mode 100644 Documentation/80211/cfg80211-subsystem/test-mode.rst create mode 100644 Documentation/80211/cfg80211-subsystem/utility-functions.rst create mode 100644 Documentation/80211/explaining-wireless-80211-networking-in-the-linux-kernel.rst create mode 100644 Documentation/80211/index.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/access-point-mode-support.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/aggregation.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/beacon-filter-support.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/hardware-crypto-acceleration.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/hardware-scan-offload.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/index.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/led-support.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/multiple-queues-and-qos-support.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/powersave-support.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/spatial-multiplexing-powersave.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/station-handling.rst create mode 100644 Documentation/80211/mac80211-developers-guide/advanced/supporting-multiple-virtual-interfaces.rst create mode 100644 Documentation/80211/mac80211-developers-guide/basic/basic-hardware-handling.rst create mode 100644
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
> If you continue discussing after that point, I guess that such a condition is not needed. > then you make a clear statement that it isn't an accident. I hope that most of my software development activities are not "an accident". Is the intent for any update suggestion (like the renaming of a jump label in this case) reasonable to some degree? > You are deliberately wasting their time. I imagine that I do not really try to "waste" others time. But I am trying also to change some "things". There are circumstances when these contributions are interpreted as "wasted efforts". Is the change acceptance usually higher for other update patterns? > Something along "I will not listen. I am listening while my responses might not fit to your current expectations. > I will not change. I have got also some personal change opportunities. > Nothing you tell me will ever make it worth your time to do so"? While you can be so clear about a rejection for this software module at the moment, other contributors showed occasionally more positive information. Regards, Markus -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: utime accounting regression since 4.6 (was: Re: [PACTH v2 0/3] Implement /proc//totmaps)
On Fri, 2016-09-30 at 11:49 +0200, Michal Hocko wrote: > [CC Mike and Mel as they have seen some accounting oddities > when doing performance testing. They can share details but > essentially the system time just gets too high] > > For your reference the email thread started > http://lkml.kernel.org/r/20160823143330.gl23...@dhcp22.suse.cz > > I suspect this is mainly for short lived processes - like kernel > compile > $ /usr/bin/time -v make mm/mmap.o > [...] > User time (seconds): 0.45 > System time (seconds): 0.82 > Percent of CPU this job got: 111% > Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.14 > $ rm mm/mmap.o > $ /usr/bin/time -v make mm/mmap.o > [...] > User time (seconds): 0.47 > System time (seconds): 1.55 > Percent of CPU this job got: 107% > Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.88 I was not able to get the "expected" results from your last reproducer, but this one does happen on my system, too. The bad news is, I still have no clue what is causing it... -- All Rights Reversed. signature.asc Description: This is a digitally signed message part
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
> I don't like out labels, but that's my opinion. Thanks for this acknowledgement that you have still got a special opinion about such an identifier. > There is nothing in CodingStyle which says you can't do it. Does the terse description there try to suggest also to choose better identifiers for source code places? Does the meaning of such a coding style specification include also the selection of a more pleasing identifier than "error" for this software module? Regards, Markus -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
SF Markus Elfringwrites: >> When someone tells you that you are wasting their time, > > This information can be useful to some degree Yes. If you continue discussing after that point, then you make a clear statement that it isn't an accident. You are deliberately wasting their time. A lot of people already know this. But you're right that it would be useful to make it even clearer. Maybe you could add a note about it to each patch? Something along "I will not listen. I will not change. Nothing you tell me will ever make it worth your time to do so"? Just an idea... Bjørn -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
On Fri, Sep 30, 2016 at 01:32:23PM +0200, SF Markus Elfring wrote: > > I specifically did not say that "out:" or "error:" labels are bad names. > > Did you inform me once that you had also a special opinion about an identifier > like "out"? I don't like out labels, but that's my opinion. There is nothing in CodingStyle which says you can't do it. regards, dan carpenter -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
> When someone tells you that you are wasting their time, This information can be useful to some degree > then that is not a subject for further discussion. I got an other impression. I guess that there are constraints for such a response which can become interesting for further considerations. Is it just usual that other software changes are more welcome? Regards, Markus -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
SF Markus Elfringwrites: >> go around bothering everyone with waste of time cleanup patches. > > I find it still debatable if the shown software development efforts > are really "wasted". When someone tells you that you are wasting their time, then that is not a subject for further discussion. Bjørn -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: md/dm-crypt: Rename a jump label in crypt_message() ?
> Again, I wrote the paragraph in CodingStyle. This is obvious from the corresponding commit "add some more error handling guidelines" (ea04036032edda6f771c1381d03832d2ed0f6c31 on 2014-12-02). > I just said that it's a good idea to think about label names I agree also to such a desire. > instead of using GW-BASIC style numbered labels, Is this kind of wording another weakness in the discussed document? How many guidance do programmers get from such a software specification? I came a few source code places along where I proposed corresponding changes. > I didn't say You did not say anything about some details as it is often easier to express several aspects in vague and general terms. > go around bothering everyone with waste of time cleanup patches. I find it still debatable if the shown software development efforts are really "wasted". It seems that also the Linux development community is mixed about related interpretations. > I specifically did not say that "out:" or "error:" labels are bad names. Did you inform me once that you had also a special opinion about an identifier like "out"? The C compiler will accept them as usual. But do we occasionally prefer to express implementation details a bit better there? > Those are common style in the kernel. * Which impressions can you get from a statement like "goto fail;" or "goto error;"? * Do any exception handling implementations should be reconsidered at such places? > Please stop sending these patches. Could it happen that the change acceptance will increase also for the suggested renaming of jump labels if maintainers from other subsystems would dare to respond once more in a positive way for such a software refactoring? Regards, Markus -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: utime accounting regression since 4.6 (was: Re: [PACTH v2 0/3] Implement /proc//totmaps)
[CC Mike and Mel as they have seen some accounting oddities when doing performance testing. They can share details but essentially the system time just gets too high] For your reference the email thread started http://lkml.kernel.org/r/20160823143330.gl23...@dhcp22.suse.cz I suspect this is mainly for short lived processes - like kernel compile $ /usr/bin/time -v make mm/mmap.o [...] User time (seconds): 0.45 System time (seconds): 0.82 Percent of CPU this job got: 111% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.14 $ rm mm/mmap.o $ /usr/bin/time -v make mm/mmap.o [...] User time (seconds): 0.47 System time (seconds): 1.55 Percent of CPU this job got: 107% Elapsed (wall clock) time (h:mm:ss or m:ss): 0:01.88 This is quite unexpected for a cache hot compile. I would expect most of the time being spent in userspace. $ perf report | grep kernel.vmlinux 2.01% as[kernel.vmlinux] [k] page_fault 0.59% cc1 [kernel.vmlinux] [k] page_fault 0.15% git [kernel.vmlinux] [k] page_fault 0.12% bash [kernel.vmlinux] [k] page_fault 0.11% sh[kernel.vmlinux] [k] page_fault 0.08% gcc [kernel.vmlinux] [k] page_fault 0.06% make [kernel.vmlinux] [k] page_fault 0.04% rm[kernel.vmlinux] [k] page_fault 0.03% ld[kernel.vmlinux] [k] page_fault 0.02% bash [kernel.vmlinux] [k] entry_SYSCALL_64 0.01% git [kernel.vmlinux] [k] entry_SYSCALL_64 0.01% cat [kernel.vmlinux] [k] page_fault 0.01% collect2 [kernel.vmlinux] [k] page_fault 0.00% sh[kernel.vmlinux] [k] entry_SYSCALL_64 0.00% rm[kernel.vmlinux] [k] entry_SYSCALL_64 0.00% grep [kernel.vmlinux] [k] page_fault doesn't show anything unexpected. Original Rik's reply follows: On Tue 23-08-16 17:46:11, Rik van Riel wrote: > On Tue, 2016-08-23 at 16:33 +0200, Michal Hocko wrote: [...] > > OK, so it seems I found it. I was quite lucky because > > account_user_time > > is not all that popular function and there were basically no changes > > besides Riks ff9a9b4c4334 ("sched, time: Switch > > VIRT_CPU_ACCOUNTING_GEN > > to jiffy granularity") and that seems to cause the regression. > > Reverting > > the commit on top of the current mmotm seems to fix the issue for me. > > > > And just to give Rik more context. While debugging overhead of the > > /proc//smaps I am getting a misleading output from /usr/bin/time > > -v > > (source for ./max_mmap is [1]) > > > > root@test1:~# uname -r > > 4.5.0-rc6-bisect1-00025-gff9a9b4c4334 > > root@test1:~# ./max_map > > pid:2990 maps:65515 > > root@test1:~# /usr/bin/time -v awk '/^Rss/{rss+=$2} /^Pss/{pss+=$2} > > END {printf "rss:%d pss:%d\n", rss, pss}' /proc/2990/smaps > > rss:263368 pss:262203 > > Command being timed: "awk /^Rss/{rss+=$2} /^Pss/{pss+=$2} END > > {printf "rss:%d pss:%d\n", rss, pss} /proc/2990/smaps" > > User time (seconds): 0.00 > > System time (seconds): 0.45 > > Percent of CPU this job got: 98% > > > > > root@test1:~# /usr/bin/time -v awk '/^Rss/{rss+=$2} /^Pss/{pss+=$2} > > END {printf "rss:%d pss:%d\n", rss, pss}' /proc/3015/smaps > > rss:263316 pss:262199 > > Command being timed: "awk /^Rss/{rss+=$2} /^Pss/{pss+=$2} END > > {printf "rss:%d pss:%d\n", rss, pss} /proc/3015/smaps" > > User time (seconds): 0.18 > > System time (seconds): 0.29 > > Percent of CPU this job got: 97% > > The patch in question makes user and system > time accounting essentially tick-based. If > jiffies changes while the task is in user > mode, time gets accounted as user time, if > jiffies changes while the task is in system > mode, time gets accounted as system time. > > If you get "unlucky", with a job like the > above, it is possible all time gets accounted > to system time. > > This would be true both with the system running > with a periodic timer tick (before and after my > patch is applied), and in nohz_idle mode (after > my patch). > > However, it does seem quite unlikely that you > get zero user time, since you have 125 timer > ticks in half a second. Furthermore, you do not > even have NO_HZ_FULL enabled... > > Does the workload consistently get zero user > time? > > If so, we need to dig further to see under > what precise circumstances that happens. > > On my laptop, with kernel 4.6.3-300.fc24.x86_64 > I get this: > > $ /usr/bin/time -v awk '/^Rss/{rss+=$2} /^Pss/{pss+=$2} END {printf > "rss:%d pss:%d\n", rss, pss}' /proc/19825/smaps > rss:263368 pss:262145 > Command being timed: "awk /^Rss/{rss+=$2} /^Pss/{pss+=$2} END > {printf "rss:%d pss:%d\n", rss, pss} /proc/19825/smaps" > User time (seconds): 0.64 > System time (seconds): 0.19 > Percent of CPU this
Re: [PATCH v8 0/8] thunderbolt: Introducing Thunderbolt(TM) Networking
On Fri, Sep 30, 2016 at 08:37:36AM +, Levy, Amir (Jer) wrote: > On Fri, Sep 30 2016, 09:40 AM, David Miller wrote: > > From: Greg KH> > Date: Fri, 30 Sep 2016 08:30:05 +0200 > > > > > On Fri, Sep 30, 2016 at 01:55:55AM -0400, David Miller wrote: > > >> From: Amir Levy > > >> Date: Wed, 28 Sep 2016 17:44:22 +0300 > > >> > > >> > This driver enables Thunderbolt Networking on non-Apple platforms > > >> > running Linux. > > >> > > >> Greg, any idea where this should get merged once fully vetted? I can > > >> take it through the net-next tree, but I'm fine with another more > > >> appropriate tree taking it as well. > > > > > > I am supposed to be taking thunderbolt patches, but if this really is > > > a network driver, it should go under drivers/net/ somewhere. It needs > > > more review though, it's not ready to go through anyone's tree just > > > yet :) > > > > > > I'll let the thunderbolt maintainer go through it first before asking > > > for a netdev review. > > > > Ok, thanks Greg. > > Greg, David, > Andreas replied to similar request on patch v6: > "This driver is independent from mine. It uses an interface provided > by the firmware which is not present on Apple hardware and with which > I am not familiar (also it does networking, not pci with which I am > also not familiar). So I cannot comment on the driver itself. I don't > mind a second driver, if that is what you are asking." Yes, but I still need an ack from the thunderbolt maintainer that you aren't doing anything foolish with that interface before I can take the code. > Note that Thunderbolt Networking is the first feature we would like to > submit, but the next features aren't related to network, but more to > Thunderbolt functionality. If this really is a real network device, it should probably live in drivers/net/ like other network drivers. > This is the reason I created the directory thunderbolt/icm, since the > next features requires ICM to be enabled as well. As long as you have ICM split out so that other drivers can use it, it should be fine, no matter where in the tree it lives, right? > I also followed the firewire as example that includes net.c (in > drivers/firewire directory) along with other firewire functionality. That's the old-style of placing files. We have moved the USB network drivers out of drivers/usb/ a while ago. The current thought is to group drivers of specific types, not busses, together wherever possible, as that is usually the majority of the logic in the driver (i.e. a USB network driver has more network-driver specific logic than USB-specific logic.) Hope this helps explain things. I'll get to your patches next week, they are in my queue at the moment, but have conferences to deal with at the moment. Don't let my delay stop you from working on further "ICM" drivers if needed, you can always send new series of patches that build on this one when you have it ready. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH v8 0/8] thunderbolt: Introducing Thunderbolt(TM) Networking
On Fri, Sep 30 2016, 09:40 AM, David Miller wrote: > From: Greg KH> Date: Fri, 30 Sep 2016 08:30:05 +0200 > > > On Fri, Sep 30, 2016 at 01:55:55AM -0400, David Miller wrote: > >> From: Amir Levy > >> Date: Wed, 28 Sep 2016 17:44:22 +0300 > >> > >> > This driver enables Thunderbolt Networking on non-Apple platforms > >> > running Linux. > >> > >> Greg, any idea where this should get merged once fully vetted? I can > >> take it through the net-next tree, but I'm fine with another more > >> appropriate tree taking it as well. > > > > I am supposed to be taking thunderbolt patches, but if this really is > > a network driver, it should go under drivers/net/ somewhere. It needs > > more review though, it's not ready to go through anyone's tree just > > yet :) > > > > I'll let the thunderbolt maintainer go through it first before asking > > for a netdev review. > > Ok, thanks Greg. Greg, David, Andreas replied to similar request on patch v6: "This driver is independent from mine. It uses an interface provided by the firmware which is not present on Apple hardware and with which I am not familiar (also it does networking, not pci with which I am also not familiar). So I cannot comment on the driver itself. I don't mind a second driver, if that is what you are asking." Note that Thunderbolt Networking is the first feature we would like to submit, but the next features aren't related to network, but more to Thunderbolt functionality. This is the reason I created the directory thunderbolt/icm, since the next features requires ICM to be enabled as well. I also followed the firewire as example that includes net.c (in drivers/firewire directory) along with other firewire functionality. Thanks, Amir -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v8 0/8] thunderbolt: Introducing Thunderbolt(TM) Networking
On Fri, Sep 30, 2016 at 01:55:55AM -0400, David Miller wrote: > From: Amir Levy> Date: Wed, 28 Sep 2016 17:44:22 +0300 > > > This driver enables Thunderbolt Networking on non-Apple platforms > > running Linux. > > Greg, any idea where this should get merged once fully vetted? I can > take it through the net-next tree, but I'm fine with another more > appropriate tree taking it as well. I am supposed to be taking thunderbolt patches, but if this really is a network driver, it should go under drivers/net/ somewhere. It needs more review though, it's not ready to go through anyone's tree just yet :) I'll let the thunderbolt maintainer go through it first before asking for a netdev review. thanks, greg k-h -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html