From: Yongliang Gao <[email protected]>

When the system has many cores and task switching is frequent, setting
set_ftrace_pid can cause heavy contention on pid_list->lock and high
sys CPU usage, because every context switch ends up taking the lock in
trace_pid_list_is_set().
For example, in a 288-core VM environment, we observed 267 CPUs
contending on pid_list->lock, with stack traces showing:

 #4 [ffffa6226fb4bc70] native_queued_spin_lock_slowpath at ffffffff99cd4b7e
 #5 [ffffa6226fb4bc90] _raw_spin_lock_irqsave at ffffffff99cd3e36
 #6 [ffffa6226fb4bca0] trace_pid_list_is_set at ffffffff99267554
 #7 [ffffa6226fb4bcc0] trace_ignore_this_task at ffffffff9925c288
 #8 [ffffa6226fb4bcd8] ftrace_filter_pid_sched_switch_probe at ffffffff99246efe
 #9 [ffffa6226fb4bcf0] __schedule at ffffffff99ccd161

Convert the read paths to a seqcount (seqcount_raw_spinlock_t) so that
readers can run locklessly and concurrently, while writers remain
exclusive under the existing raw spinlock.

Signed-off-by: Yongliang Gao <[email protected]>
Reviewed-by: Huang Cun <[email protected]>
---
Changes from v1:
- Fixed sleep-in-atomic issues under PREEMPT_RT. [Steven Rostedt]
- Link to v1: https://lore.kernel.org/all/[email protected]
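For reviewers unfamiliar with the pattern, below is a distilled sketch
of the seqcount read/retry scheme this patch applies. The "demo"
struct and the demo_init/demo_read/demo_write names are made up for
illustration and are not part of the patch:

/* Illustrative sketch only, not part of the patch. */
#include <linux/seqlock.h>
#include <linux/spinlock.h>

struct demo {
	raw_spinlock_t		lock;		/* still serializes writers */
	seqcount_raw_spinlock_t	seqcount;	/* bumped around each update */
	unsigned long		data;
};

/* Init: associate the seqcount with the writer-side lock. */
static void demo_init(struct demo *d)
{
	raw_spin_lock_init(&d->lock);
	seqcount_raw_spinlock_init(&d->seqcount, &d->lock);
}

/* Reader: takes no lock; retries only if a writer raced with it. */
static unsigned long demo_read(struct demo *d)
{
	unsigned int seq;
	unsigned long val;

	do {
		seq = read_seqcount_begin(&d->seqcount);
		val = d->data;
	} while (read_seqcount_retry(&d->seqcount, seq));

	return val;
}

/*
 * Writer: locking is unchanged; the seqcount bump makes any racing
 * reader discard its snapshot and retry.
 */
static void demo_write(struct demo *d, unsigned long val)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&d->lock, flags);
	write_seqcount_begin(&d->seqcount);
	d->data = val;
	write_seqcount_end(&d->seqcount);
	raw_spin_unlock_irqrestore(&d->lock, flags);
}

In the patch, trace_pid_list_is_set() follows demo_read(), and
trace_pid_list_set()/trace_pid_list_clear() follow demo_write();
trace_pid_list_next() additionally saves its cursor (upper1/upper2/
lower) so a failed read section can rewind and rescan. Since
read_seqcount_begin() only reads the counter, readers no longer write
the lock's cache line at all, which is what removes the cross-CPU
bouncing seen in the trace above. Pairing the seqcount with the
existing raw_spinlock_t keeps the write side safe in the scheduler's
atomic context on PREEMPT_RT, where a seqlock_t (built on a sleeping
spinlock_t) could sleep; this is presumably what the v1 fix addressed.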
---
 kernel/trace/pid_list.c | 77 ++++++++++++++++++++++++++++-------------
 kernel/trace/pid_list.h |  1 +
 2 files changed, 53 insertions(+), 25 deletions(-)

diff --git a/kernel/trace/pid_list.c b/kernel/trace/pid_list.c
index 090bb5ea4a19..8921102543ff 100644
--- a/kernel/trace/pid_list.c
+++ b/kernel/trace/pid_list.c
@@ -2,7 +2,7 @@
 /*
  * Copyright (C) 2021 VMware Inc, Steven Rostedt <[email protected]>
  */
-#include <linux/spinlock.h>
+#include <linux/seqlock.h>
 #include <linux/irq_work.h>
 #include <linux/slab.h>
 #include "trace.h"
@@ -126,7 +126,7 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 {
 	union upper_chunk *upper_chunk;
 	union lower_chunk *lower_chunk;
-	unsigned long flags;
+	unsigned int seq;
 	unsigned int upper1;
 	unsigned int upper2;
 	unsigned int lower;
@@ -138,14 +138,16 @@ bool trace_pid_list_is_set(struct trace_pid_list *pid_list, unsigned int pid)
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return false;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
-	upper_chunk = pid_list->upper[upper1];
-	if (upper_chunk) {
-		lower_chunk = upper_chunk->data[upper2];
-		if (lower_chunk)
-			ret = test_bit(lower, lower_chunk->data);
-	}
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+	do {
+		seq = read_seqcount_begin(&pid_list->seqcount);
+		ret = false;
+		upper_chunk = pid_list->upper[upper1];
+		if (upper_chunk) {
+			lower_chunk = upper_chunk->data[upper2];
+			if (lower_chunk)
+				ret = test_bit(lower, lower_chunk->data);
+		}
+	} while (read_seqcount_retry(&pid_list->seqcount, seq));
 
 	return ret;
 }
@@ -178,6 +180,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk) {
 		upper_chunk = get_upper_chunk(pid_list);
@@ -199,6 +202,7 @@ int trace_pid_list_set(struct trace_pid_list *pid_list, unsigned int pid)
 	set_bit(lower, lower_chunk->data);
 	ret = 0;
  out:
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
 	return ret;
 }
@@ -230,6 +234,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		return -EINVAL;
 
 	raw_spin_lock_irqsave(&pid_list->lock, flags);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_chunk = pid_list->upper[upper1];
 	if (!upper_chunk)
 		goto out;
@@ -250,6 +255,7 @@ int trace_pid_list_clear(struct trace_pid_list *pid_list, unsigned int pid)
 		}
 	}
  out:
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
 	return 0;
 }
@@ -271,7 +277,6 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 {
 	union upper_chunk *upper_chunk;
 	union lower_chunk *lower_chunk;
-	unsigned long flags;
 	unsigned int upper1;
 	unsigned int upper2;
 	unsigned int lower;
@@ -282,27 +287,44 @@ int trace_pid_list_next(struct trace_pid_list *pid_list, unsigned int pid,
 	if (pid_split(pid, &upper1, &upper2, &lower) < 0)
 		return -EINVAL;
 
-	raw_spin_lock_irqsave(&pid_list->lock, flags);
-	for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
-		upper_chunk = pid_list->upper[upper1];
+	do {
+		unsigned int start_upper1 = upper1;
+		unsigned int start_upper2 = upper2;
+		unsigned int start_lower = lower;
+		unsigned int seq;
 
-		if (!upper_chunk)
-			continue;
+		seq = read_seqcount_begin(&pid_list->seqcount);
 
-		for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
-			lower_chunk = upper_chunk->data[upper2];
-			if (!lower_chunk)
+		for (; upper1 <= UPPER_MASK; upper1++, upper2 = 0) {
+			upper_chunk = pid_list->upper[upper1];
+
+			if (!upper_chunk)
 				continue;
 
-			lower = find_next_bit(lower_chunk->data, LOWER_MAX,
-					      lower);
-			if (lower < LOWER_MAX)
-				goto found;
+			for (; upper2 <= UPPER_MASK; upper2++, lower = 0) {
+				lower_chunk = upper_chunk->data[upper2];
+				if (!lower_chunk)
+					continue;
+
+				lower = find_next_bit(lower_chunk->data, LOWER_MAX,
+						      lower);
+				if (lower < LOWER_MAX)
+					goto found;
+			}
 		}
-	}
+		upper1 = UPPER_MASK + 1; /* Mark as not found */
 
 found:
-	raw_spin_unlock_irqrestore(&pid_list->lock, flags);
+		if (read_seqcount_retry(&pid_list->seqcount, seq)) {
+			/* Retry if write happened during read */
+			upper1 = start_upper1;
+			upper2 = start_upper2;
+			lower = start_lower;
+			continue;
+		}
+		break;
+	} while (1);
+
 	if (upper1 > UPPER_MASK)
 		return -1;
 
@@ -340,8 +362,10 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 
  again:
 	raw_spin_lock(&pid_list->lock);
+	write_seqcount_begin(&pid_list->seqcount);
 	upper_count = CHUNK_ALLOC - pid_list->free_upper_chunks;
 	lower_count = CHUNK_ALLOC - pid_list->free_lower_chunks;
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock(&pid_list->lock);
 
 	if (upper_count <= 0 && lower_count <= 0)
@@ -370,6 +394,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 	}
 
 	raw_spin_lock(&pid_list->lock);
+	write_seqcount_begin(&pid_list->seqcount);
 	if (upper) {
 		*upper_next = pid_list->upper_list;
 		pid_list->upper_list = upper;
@@ -380,6 +405,7 @@ static void pid_list_refill_irq(struct irq_work *iwork)
 		pid_list->lower_list = lower;
 		pid_list->free_lower_chunks += lcnt;
 	}
+	write_seqcount_end(&pid_list->seqcount);
 	raw_spin_unlock(&pid_list->lock);
 
 	/*
@@ -419,6 +445,7 @@ struct trace_pid_list *trace_pid_list_alloc(void)
 	init_irq_work(&pid_list->refill_irqwork, pid_list_refill_irq);
 
 	raw_spin_lock_init(&pid_list->lock);
+	seqcount_raw_spinlock_init(&pid_list->seqcount, &pid_list->lock);
 
 	for (i = 0; i < CHUNK_ALLOC; i++) {
 		union upper_chunk *chunk;
diff --git a/kernel/trace/pid_list.h b/kernel/trace/pid_list.h
index 62e73f1ac85f..0b45fb0eadb9 100644
--- a/kernel/trace/pid_list.h
+++ b/kernel/trace/pid_list.h
@@ -76,6 +76,7 @@ union upper_chunk {
 };
 
 struct trace_pid_list {
+	seqcount_raw_spinlock_t	seqcount;
 	raw_spinlock_t		lock;
 	struct irq_work		refill_irqwork;
 	union upper_chunk	*upper[UPPER1_SIZE];	// 1 or 2K in size
-- 
2.43.5
