From: zhidao su <[email protected]>

Add basic selftests for the Proxy Execution (PE) feature (CONFIG_SCHED_PROXY_EXEC). Three test cases exercise the single-CPU PE path, which is present in the current upstream kernel independently of the donor migration series (v24).
TC-1: Single-level mutex blocking
  A SCHED_FIFO prio=80 thread blocks on a mutex held by a SCHED_OTHER thread doing CPU-intensive work. Verifies that the holder accumulates significant CPU time (>= 50ms out of 200ms hold period), confirming PE is running the holder as proxy for the high-priority waiter.

TC-2: blocked_on lifetime - no voluntary context switches
  While a high-priority thread is PE-blocked on a mutex, its voluntary_ctxt_switches count must not increase. PE keeps the donor on the runqueue rather than doing a voluntary sleep, so no voluntary switch should occur during the block period.

TC-3: Two-level mutex chain traversal
  A (prio=80) -> mutex1 -> B (prio=50) -> mutex2 -> C (SCHED_OTHER). Verifies PE traverses the full chain and runs C as proxy, confirmed by C accumulating >= 50ms CPU time while A and B are both blocked.

The test skips gracefully when:
  - CONFIG_SCHED_PROXY_EXEC is not compiled in
  - sched_proxy_exec=0 is set on the kernel command line
  - not running as root (SCHED_FIFO requires CAP_SYS_NICE)

These tests cover the single-CPU PE base functionality and are orthogonal to the cross-CPU donor migration work (v24). They should remain valid after donor migration lands, as the single-CPU path is preserved.
Tested on Linux 7.0-rc2 with CONFIG_SCHED_PROXY_EXEC=y via virtme-ng on Intel Core i7-10700 @ 2.90GHz: sched_proxy_exec=1: TC-1 PASS, TC-2 PASS, TC-3 PASS sched_proxy_exec=0: all SKIP Signed-off-by: zhidao su <[email protected]> --- tools/testing/selftests/sched/Makefile | 6 +- tools/testing/selftests/sched/pe_mutex_test.c | 508 ++++++++++++++++++ 2 files changed, 511 insertions(+), 3 deletions(-) create mode 100644 tools/testing/selftests/sched/pe_mutex_test.c diff --git a/tools/testing/selftests/sched/Makefile b/tools/testing/selftests/sched/Makefile index 099ee921355..5ecfa45a103 100644 --- a/tools/testing/selftests/sched/Makefile +++ b/tools/testing/selftests/sched/Makefile @@ -6,9 +6,9 @@ endif CFLAGS += -O2 -Wall -g -I./ $(KHDR_INCLUDES) -Wl,-rpath=./ \ $(CLANG_FLAGS) -LDLIBS += -lpthread +LDLIBS += -lpthread -lrt -TEST_GEN_FILES := cs_prctl_test -TEST_PROGS := cs_prctl_test +TEST_GEN_FILES := cs_prctl_test pe_mutex_test +TEST_PROGS := cs_prctl_test pe_mutex_test include ../lib.mk diff --git a/tools/testing/selftests/sched/pe_mutex_test.c b/tools/testing/selftests/sched/pe_mutex_test.c new file mode 100644 index 00000000000..b3ff4852ddc --- /dev/null +++ b/tools/testing/selftests/sched/pe_mutex_test.c @@ -0,0 +1,508 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Proxy Execution (PE) mutex selftest - TC-1 through TC-3 + * + * Verifies basic PE behavior for mutex blocking: + * TC-1: High-priority blocked task's CPU time increases via PE + * TC-2: blocked_on lifetime - voluntary ctxt switches don't increase + * TC-3: Two-level mutex chain traversal + * + * Requires CONFIG_SCHED_PROXY_EXEC=y and root privileges. 
+ */ + +#define _GNU_SOURCE +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sched.h> +#include <signal.h> +#include <stdatomic.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/syscall.h> + +#include "../kselftest.h" + +/* ---------- helpers ---------- */ + +static pid_t gettid_compat(void) +{ + return (pid_t)syscall(SYS_gettid); +} + +/* + * is_proxy_exec_enabled - check whether PE is active at runtime. + * + * PE has no sysctl; it is controlled by the "sched_proxy_exec" boot + * parameter. DEFINE_STATIC_KEY_TRUE means it defaults ON unless + * "sched_proxy_exec=0" appears on the kernel command line. + */ +static bool is_proxy_exec_enabled(void) +{ + char line[4096]; + FILE *f; + + f = fopen("/proc/cmdline", "r"); + if (!f) + return true; /* assume enabled if we cannot read cmdline */ + + if (!fgets(line, sizeof(line), f)) { + fclose(f); + return true; + } + fclose(f); + + return !strstr(line, "sched_proxy_exec=0"); +} + +/* Return monotonic time in nanoseconds. */ +static long long now_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +/* Return CPU time consumed by the calling thread in nanoseconds. */ +static long long cputime_ns(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts); + return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec; +} + +/* + * get_voluntary_ctxt_switches - read voluntary_ctxt_switches for @tid. + * + * Threads (tid != tgid) are only visible under + * /proc/<tgid>/task/<tid>/status, not /proc/<tid>/status directly. + * Try the task path first, fall back to the top-level pid path. 
+ */ +static long get_voluntary_ctxt_switches(pid_t tid) +{ + char path[128]; + char line[256]; + FILE *f; + long val = -1; + + /* Try /proc/<tgid>/task/<tid>/status (works for all threads) */ + snprintf(path, sizeof(path), "/proc/%d/task/%d/status", + (int)getpid(), (int)tid); + f = fopen(path, "r"); + if (!f) { + /* Fallback: /proc/<tid>/status (works only for tgid == tid) */ + snprintf(path, sizeof(path), "/proc/%d/status", (int)tid); + f = fopen(path, "r"); + } + if (!f) + return -1; + + while (fgets(line, sizeof(line), f)) { + if (strncmp(line, "voluntary_ctxt_switches:", 24) == 0) { + val = strtol(line + 24, NULL, 10); + break; + } + } + fclose(f); + return val; +} + +/* Set SCHED_FIFO priority for the calling thread. */ +static int set_fifo(int prio) +{ + struct sched_param sp = { .sched_priority = prio }; + + return sched_setscheduler(0, SCHED_FIFO, &sp); +} + +/* Set SCHED_OTHER (normal) for the calling thread. */ +static int set_normal(void) +{ + struct sched_param sp = { .sched_priority = 0 }; + + return sched_setscheduler(0, SCHED_OTHER, &sp); +} + +/* ---------- TC-1 ---------------------------------------------------------- + * + * Single-level PE: high-priority waiter gets CPU via PE. + * + * Setup: + * - LOW thread (SCHED_OTHER): holds mutex, burns CPU for ~200 ms, + * then releases. + * - HIGH thread (SCHED_FIFO prio=80): waits for mutex immediately. + * + * On a PE kernel the scheduler runs LOW as proxy for HIGH, so LOW + * should accumulate significant CPU time (measured via + * CLOCK_PROCESS_CPUTIME_ID inside the holder thread itself). + * + * Verification: CPU time consumed by the LOW thread during the hold + * period is >= 50 ms. CLOCK_THREAD_CPUTIME_ID is used so that only + * LOW's own CPU consumption is measured, not that of other threads. 
+ */ + +#define TC1_HOLD_MS 200 /* ms LOW holds the mutex */ +#define TC1_CPU_THRESHOLD_MS 50 /* minimum CPU ms we expect */ + +struct tc1_args { + pthread_mutex_t *mtx; + long long cpu_during_hold_ns; /* output: CPU ns consumed by LOW */ + atomic_int ready; + atomic_int done; +}; + +static void *tc1_low_thread(void *arg) +{ + struct tc1_args *a = arg; + long long t0, t1, deadline; + + /* Become the LOW thread */ + set_normal(); + + pthread_mutex_lock(a->mtx); + a->ready = 1; + + /* Spin for TC1_HOLD_MS real-time milliseconds while holding lock */ + deadline = now_ns() + (long long)TC1_HOLD_MS * 1000000LL; + t0 = cputime_ns(); + while (now_ns() < deadline) + ; /* busy wait */ + t1 = cputime_ns(); + + a->cpu_during_hold_ns = t1 - t0; + pthread_mutex_unlock(a->mtx); + a->done = 1; + return NULL; +} + +static void *tc1_high_thread(void *arg) +{ + struct tc1_args *a = arg; + + /* Become HIGH priority */ + set_fifo(80); + + /* Wait until LOW has the lock */ + while (!a->ready) + sched_yield(); + + /* Block on mutex - PE should now proxy-run LOW */ + pthread_mutex_lock(a->mtx); + pthread_mutex_unlock(a->mtx); + return NULL; +} + +static void run_tc1(void) +{ + pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + struct tc1_args args = { .mtx = &mtx, .ready = 0, .done = 0 }; + pthread_t low, high; + long long threshold_ns = (long long)TC1_CPU_THRESHOLD_MS * 1000000LL; + + pthread_create(&low, NULL, tc1_low_thread, &args); + + /* Wait for LOW to acquire the lock before creating HIGH */ + while (!args.ready) + sched_yield(); + + pthread_create(&high, NULL, tc1_high_thread, &args); + + pthread_join(high, NULL); + pthread_join(low, NULL); + + pthread_mutex_destroy(&mtx); + + if (args.cpu_during_hold_ns >= threshold_ns) { + ksft_test_result_pass( + "TC-1: PE ran LOW as proxy (cpu_hold=%lld ms >= %d ms)\n", + args.cpu_during_hold_ns / 1000000, + TC1_CPU_THRESHOLD_MS); + } else { + ksft_test_result_fail( + "TC-1: LOW did not get enough CPU time (cpu_hold=%lld ms < %d ms)\n", + 
args.cpu_during_hold_ns / 1000000, + TC1_CPU_THRESHOLD_MS); + } +} + +/* ---------- TC-2 ---------------------------------------------------------- + * + * blocked_on lifetime: voluntary context switches must NOT increase + * for the high-priority waiter while it is proxy-blocked. + * + * When PE is active the high-priority task stays on the runqueue + * (as donor) and is never voluntarily context-switched out. + * + * Verification: + * Record voluntary_ctxt_switches for HIGH before and after the + * blocking period; they should be equal. + */ + +#define TC2_HOLD_MS 150 + +struct tc2_args { + pthread_mutex_t *mtx; + pid_t high_tid; + atomic_int low_has_lock; /* LOW signals it holds the mutex */ + atomic_int high_blocking; /* HIGH signals it is about to block */ + long ctxt_after; /* HIGH records its own switches after unblock */ +}; + +static void *tc2_low_thread(void *arg) +{ + struct tc2_args *a = arg; + long long deadline; + + set_normal(); + pthread_mutex_lock(a->mtx); + a->low_has_lock = 1; + + deadline = now_ns() + (long long)TC2_HOLD_MS * 1000000LL; + while (now_ns() < deadline) + ; /* busy spin holding the lock */ + + pthread_mutex_unlock(a->mtx); + return NULL; +} + +static void *tc2_high_thread(void *arg) +{ + struct tc2_args *a = arg; + + set_fifo(80); + a->high_tid = gettid_compat(); + + /* Wait until LOW holds the lock */ + while (!a->low_has_lock) + sched_yield(); + + /* Signal main that we are about to block, then immediately block */ + a->high_blocking = 1; + pthread_mutex_lock(a->mtx); + pthread_mutex_unlock(a->mtx); + /* Record our own ctxt switches before exiting (proc entry still live) */ + a->ctxt_after = get_voluntary_ctxt_switches(gettid_compat()); + return NULL; +} + +static void run_tc2(void) +{ + pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + struct tc2_args args = { + .mtx = &mtx, + .high_tid = 0, + .low_has_lock = 0, + .high_blocking = 0, + .ctxt_after = -1, + }; + pthread_t low, high; + long before, after = -1; + + /* Start LOW 
first so it grabs the lock */ + pthread_create(&low, NULL, tc2_low_thread, &args); + + while (!args.low_has_lock) + sched_yield(); + + pthread_create(&high, NULL, tc2_high_thread, &args); + + /* + * Wait until HIGH has set high_tid AND signaled it is about to block. + * There is a tiny window between high_blocking=1 and the actual + * pthread_mutex_lock() call, but that is unavoidable in userspace. + * Sample "before" here; HIGH cannot have voluntarily yielded yet + * because it has not blocked yet. + */ + while (!args.high_tid || !args.high_blocking) + sched_yield(); + + /* Sample voluntary switches while HIGH is (about to be) blocked */ + before = get_voluntary_ctxt_switches(args.high_tid); + + pthread_join(high, NULL); + pthread_join(low, NULL); + + after = args.ctxt_after; + + pthread_mutex_destroy(&mtx); + + if (before < 0 || after < 0) { + ksft_test_result_skip( + "TC-2: Could not read /proc task status\n"); + return; + } + + if (after == before) { + ksft_test_result_pass( + "TC-2: HIGH voluntary_ctxt_switches unchanged (%ld) during PE block\n", + before); + } else { + ksft_test_result_fail( + "TC-2: HIGH voluntary_ctxt_switches changed: before=%ld after=%ld\n", + before, after); + } +} + +/* ---------- TC-3 ---------------------------------------------------------- + * + * Two-level mutex chain: + * A (SCHED_FIFO prio=80) -> blocked on mutex1 -> held by + * B (SCHED_FIFO prio=50) -> blocked on mutex2 -> held by + * C (SCHED_OTHER) ^^ PE must traverse + * the chain and run C + * + * Verification: C's CPU time during the hold period is >= 50 ms, + * meaning PE reached the end of the chain and ran C as proxy. 
+ */ + +#define TC3_HOLD_MS 200 +#define TC3_CPU_THRESHOLD_MS 50 + +struct tc3_args { + pthread_mutex_t *mtx1; /* A waits on this; B holds */ + pthread_mutex_t *mtx2; /* B waits on this; C holds */ + + atomic_int b_has_mtx1; /* B has acquired mtx1 */ + atomic_int c_has_mtx2; /* C has acquired mtx2 */ + + long long c_cpu_during_hold_ns; +}; + +static void *tc3_c_thread(void *arg) +{ + struct tc3_args *a = arg; + long long t0, t1, deadline; + + set_normal(); + pthread_mutex_lock(a->mtx2); + a->c_has_mtx2 = 1; + + /* Spin holding mtx2 */ + deadline = now_ns() + (long long)TC3_HOLD_MS * 1000000LL; + t0 = cputime_ns(); + while (now_ns() < deadline) + ; + t1 = cputime_ns(); + + a->c_cpu_during_hold_ns = t1 - t0; + pthread_mutex_unlock(a->mtx2); + return NULL; +} + +static void *tc3_b_thread(void *arg) +{ + struct tc3_args *a = arg; + + set_fifo(50); + + /* Acquire mtx1 first, so A will block on it */ + pthread_mutex_lock(a->mtx1); + a->b_has_mtx1 = 1; + + /* Wait until C holds mtx2 before blocking on it */ + while (!a->c_has_mtx2) + sched_yield(); + + /* Now block on mtx2 - chain: A->mtx1->B->mtx2->C */ + pthread_mutex_lock(a->mtx2); + pthread_mutex_unlock(a->mtx2); + + pthread_mutex_unlock(a->mtx1); + return NULL; +} + +static void *tc3_a_thread(void *arg) +{ + struct tc3_args *a = arg; + + set_fifo(80); + + /* Wait until the full chain is established */ + while (!a->b_has_mtx1 || !a->c_has_mtx2) + sched_yield(); + + pthread_mutex_lock(a->mtx1); + pthread_mutex_unlock(a->mtx1); + return NULL; +} + +static void run_tc3(void) +{ + pthread_mutex_t mtx1 = PTHREAD_MUTEX_INITIALIZER; + pthread_mutex_t mtx2 = PTHREAD_MUTEX_INITIALIZER; + struct tc3_args args = { + .mtx1 = &mtx1, + .mtx2 = &mtx2, + .b_has_mtx1 = 0, + .c_has_mtx2 = 0, + }; + pthread_t ta, tb, tc; + long long threshold_ns = (long long)TC3_CPU_THRESHOLD_MS * 1000000LL; + + /* Start C first so it grabs mtx2 */ + pthread_create(&tc, NULL, tc3_c_thread, &args); + + /* Wait for C to hold mtx2 */ + while 
(!args.c_has_mtx2) + sched_yield(); + + /* Start B - it will grab mtx1 then block on mtx2 */ + pthread_create(&tb, NULL, tc3_b_thread, &args); + + /* Wait for B to hold mtx1 */ + while (!args.b_has_mtx1) + sched_yield(); + + /* Start A - highest priority, blocks on mtx1 */ + pthread_create(&ta, NULL, tc3_a_thread, &args); + + pthread_join(ta, NULL); + pthread_join(tb, NULL); + pthread_join(tc, NULL); + + pthread_mutex_destroy(&mtx1); + pthread_mutex_destroy(&mtx2); + + if (args.c_cpu_during_hold_ns >= threshold_ns) { + ksft_test_result_pass( + "TC-3: PE traversed 2-level chain, C got cpu_hold=%lld ms >= %d ms\n", + args.c_cpu_during_hold_ns / 1000000, + TC3_CPU_THRESHOLD_MS); + } else { + ksft_test_result_fail( + "TC-3: C did not get enough CPU (chain traversal failed?): %lld ms < %d ms\n", + args.c_cpu_during_hold_ns / 1000000, + TC3_CPU_THRESHOLD_MS); + } +} + +/* ---------- main ---------------------------------------------------------- */ + +int main(void) +{ + ksft_print_header(); + +#ifndef CONFIG_SCHED_PROXY_EXEC + ksft_exit_skip("CONFIG_SCHED_PROXY_EXEC not enabled\n"); +#endif + + if (getuid() != 0) + ksft_exit_skip("requires root (needed for SCHED_FIFO)\n"); + + if (!is_proxy_exec_enabled()) + ksft_exit_skip("sched_proxy_exec=0 on kernel cmdline, PE disabled\n"); + + ksft_set_plan(3); + + run_tc1(); + run_tc2(); + run_tc3(); + + ksft_finished(); +} -- 2.43.0

