From: George Guo <[email protected]>

Support per-program private stacks, advertised via
bpf_jit_supports_private_stack(). When the verifier marks a program with
jits_use_priv_stack (e.g. a sufficiently deep, potentially recursive
tracing program), its BPF stack is moved off the kernel stack into a
per-CPU allocation, reducing kernel stack pressure.

The private stack is allocated in bpf_int_jit_compile(). Its size is the
verifier-computed stack depth, rounded up to 16 bytes, plus two 16-byte
guard regions used to detect overflow and underflow; the guards are
initialised at allocation time and validated in bpf_jit_free(). S5
(otherwise saved/restored but unused by the JIT) is reused to hold the
private stack pointer: the prologue loads the per-CPU address just past
the lower guard region and adds the current CPU's per-CPU offset ($r21)
to it. When a private stack is in use the BPF frame pointer points into
this per-CPU region and the BPF stack is no longer reserved on the
kernel stack.

Signed-off-by: George Guo <[email protected]>
---
 arch/loongarch/net/bpf_jit.c | 111 ++++++++++++++++++++++++++++++++++-
 arch/loongarch/net/bpf_jit.h |   1 +
 2 files changed, 109 insertions(+), 3 deletions(-)

diff --git a/arch/loongarch/net/bpf_jit.c b/arch/loongarch/net/bpf_jit.c
index 3f9ffdde2491..c410b02e64be 100644
--- a/arch/loongarch/net/bpf_jit.c
+++ b/arch/loongarch/net/bpf_jit.c
@@ -18,8 +18,13 @@
 
 #define REG_TCC                LOONGARCH_GPR_A6
 #define REG_ARENA      LOONGARCH_GPR_S6 /* For storing arena_vm_start */
+#define REG_PRIV_SP    LOONGARCH_GPR_S5 /* For storing the private stack pointer */
 #define BPF_TAIL_CALL_CNT_PTR_STACK_OFF(stack) (round_up(stack, 16) - 80)
 
+/*
+ * Size in bytes of each private stack guard region, and the pattern
+ * written into it so that overflow/underflow can be detected.
+ */
+#define PRIV_STACK_GUARD_SZ    16
+#define PRIV_STACK_GUARD_VAL   0xEB9F12345678eb9fULL
+
 static const int regmap[] = {
        /* return value from in-kernel function, and exit value for eBPF 
program */
        [BPF_REG_0] = LOONGARCH_GPR_A5,
@@ -40,6 +45,15 @@ static const int regmap[] = {
        [BPF_REG_AX] = LOONGARCH_GPR_T0,
 };
 
+/*
+ * Emit code that leaves in @dst the address of the per-CPU object @ptr:
+ * the per-CPU base is loaded as an immediate and, on SMP builds, the
+ * offset kept in $r21 (__my_cpu_offset) is added to it.
+ */
+static void emit_percpu_ptr(struct jit_ctx *ctx, u8 dst, void __percpu *ptr)
+{
+       const long base = (__force long)ptr;
+
+       move_imm(ctx, dst, base, false);
+#ifdef CONFIG_SMP
+       emit_insn(ctx, addd, dst, dst, LOONGARCH_GPR_U0);
+#endif
+}
+
 static void prepare_bpf_tail_call_cnt(struct jit_ctx *ctx, int *store_offset)
 {
        const struct bpf_prog *prog = ctx->prog;
@@ -141,7 +155,14 @@ static void build_prologue(struct jit_ctx *ctx)
                stack_adjust += 8;
 
        stack_adjust = round_up(stack_adjust, 16);
-       stack_adjust += bpf_stack_adjust;
+
+       /*
+        * When a private stack is used the BPF stack lives in a per-CPU
+        * allocation rather than on the kernel stack, so only the non-BPF
+        * part is reserved here.
+        */
+       if (!ctx->priv_sp_used)
+               stack_adjust += bpf_stack_adjust;
 
        move_reg(ctx, LOONGARCH_GPR_T0, LOONGARCH_GPR_RA);
        /* Reserve space for the move_imm + jirl instruction */
@@ -191,8 +212,16 @@ static void build_prologue(struct jit_ctx *ctx)
 
        emit_insn(ctx, addid, LOONGARCH_GPR_FP, LOONGARCH_GPR_SP, stack_adjust);
 
-       if (bpf_stack_adjust)
+       if (ctx->priv_sp_used) {
+               /* Set up the private stack pointer and the BPF frame pointer */
+               void __percpu *priv_stack_ptr;
+
+               priv_stack_ptr = prog->aux->priv_stack_ptr + 
PRIV_STACK_GUARD_SZ;
+               emit_percpu_ptr(ctx, REG_PRIV_SP, priv_stack_ptr);
+               emit_insn(ctx, addid, regmap[BPF_REG_FP], REG_PRIV_SP, 
bpf_stack_adjust);
+       } else if (bpf_stack_adjust) {
                emit_insn(ctx, addid, regmap[BPF_REG_FP], LOONGARCH_GPR_SP, 
bpf_stack_adjust);
+       }
 
        ctx->stack_size = stack_adjust;
 
@@ -2166,6 +2195,39 @@ int arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags,
        return ret < 0 ? ret : ret * LOONGARCH_INSN_SIZE;
 }
 
+/*
+ * Write PRIV_STACK_GUARD_VAL into the first and last PRIV_STACK_GUARD_SZ
+ * bytes of every possible CPU's copy of the private stack, so that
+ * priv_stack_check_guard() can later tell whether either end was
+ * overwritten.
+ *
+ * @priv_stack_ptr: per-CPU allocation holding the private stack
+ * @alloc_size: size in bytes of that allocation, guard regions included
+ */
+static void priv_stack_init_guard(void __percpu *priv_stack_ptr, int alloc_size)
+{
+       /* Index, in u64 units, of the first word of the upper guard region */
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               /* Lower guard: the two u64 words at the start */
+               stack_ptr[0] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[1] = PRIV_STACK_GUARD_VAL;
+               /* Upper guard: the two u64 words at the end */
+               stack_ptr[underflow_idx] = PRIV_STACK_GUARD_VAL;
+               stack_ptr[underflow_idx + 1] = PRIV_STACK_GUARD_VAL;
+       }
+}
+
+/*
+ * Verify that the guard words at both ends of every possible CPU's copy
+ * of the private stack still hold PRIV_STACK_GUARD_VAL. An error naming
+ * @prog is logged for the first corrupted copy found and the scan stops
+ * there.
+ *
+ * @priv_stack_ptr: per-CPU allocation holding the private stack
+ * @alloc_size: size in bytes of that allocation, guard regions included
+ * @prog: program owning the private stack, used only for the log message
+ */
+static void priv_stack_check_guard(void __percpu *priv_stack_ptr, int alloc_size,
+                                  struct bpf_prog *prog)
+{
+       /* Index, in u64 units, of the first word of the upper guard region */
+       int cpu, underflow_idx = (alloc_size - PRIV_STACK_GUARD_SZ) >> 3;
+       u64 *stack_ptr;
+
+       for_each_possible_cpu(cpu) {
+               stack_ptr = per_cpu_ptr(priv_stack_ptr, cpu);
+               if (stack_ptr[0] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[1] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[underflow_idx] != PRIV_STACK_GUARD_VAL ||
+                   stack_ptr[underflow_idx + 1] != PRIV_STACK_GUARD_VAL) {
+                       pr_err("BPF private stack overflow/underflow detected for prog %s\n",
+                              bpf_jit_get_prog_name(prog));
+                       break;
+               }
+       }
+}
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct 
bpf_prog *prog)
 {
        bool extra_pass = false;
@@ -2174,7 +2236,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
        struct jit_ctx ctx;
        struct jit_data *jit_data;
        struct bpf_binary_header *header;
-       struct bpf_binary_header *ro_header;
+       struct bpf_binary_header *ro_header = NULL;
+       void __percpu *priv_stack_ptr = NULL;
+       int priv_stack_alloc_sz;
 
        /*
         * If BPF JIT was not enabled then we must fall back to
@@ -2190,6 +2254,22 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
                        return prog;
                prog->aux->jit_data = jit_data;
        }
+       priv_stack_ptr = prog->aux->priv_stack_ptr;
+       if (!priv_stack_ptr && prog->aux->jits_use_priv_stack) {
+               /*
+                * Allocate the actual private stack: the verifier-calculated
+                * stack size plus two guard regions to detect overflow and
+                * underflow.
+                */
+               priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 16) +
+                                     2 * PRIV_STACK_GUARD_SZ;
+               priv_stack_ptr = __alloc_percpu_gfp(priv_stack_alloc_sz, 16, 
GFP_KERNEL);
+               if (!priv_stack_ptr)
+                       goto out_priv_stack;
+
+               priv_stack_init_guard(priv_stack_ptr, priv_stack_alloc_sz);
+               prog->aux->priv_stack_ptr = priv_stack_ptr;
+       }
        if (jit_data->ctx.offset) {
                ctx = jit_data->ctx;
                ro_header = jit_data->ro_header;
@@ -2205,6 +2285,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
        ctx.prog = prog;
        ctx.arena_vm_start = bpf_arena_get_kern_vm_start(prog->aux->arena);
        ctx.user_vm_start = bpf_arena_get_user_vm_start(prog->aux->arena);
+       ctx.priv_sp_used = priv_stack_ptr ? true : false;
 
        ctx.offset = kvcalloc(prog->len + 1, sizeof(u32), GFP_KERNEL);
        if (ctx.offset == NULL)
@@ -2298,7 +2379,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_verifier_env *env, struct bpf_pr
                bpf_prog_fill_jited_linfo(prog, ctx.offset + 1);
 
 out_offset:
+               /*
+                * A NULL ro_header here means the JIT failed, so release the
+                * private stack that was allocated above; on success the
+                * program keeps it until bpf_jit_free().
+                */
+               if (!ro_header && priv_stack_ptr) {
+                       free_percpu(priv_stack_ptr);
+                       prog->aux->priv_stack_ptr = NULL;
+               }
                kvfree(ctx.offset);
+out_priv_stack:
                kfree(jit_data);
                prog->aux->jit_data = NULL;
        }
@@ -2324,6 +2415,8 @@ void bpf_jit_free(struct bpf_prog *prog)
        if (prog->jited) {
                struct jit_data *jit_data = prog->aux->jit_data;
                struct bpf_binary_header *hdr;
+               void __percpu *priv_stack_ptr;
+               int priv_stack_alloc_sz;
 
                /*
                 * If we fail the final pass of JIT (from jit_subprogs), the
@@ -2336,6 +2429,13 @@ void bpf_jit_free(struct bpf_prog *prog)
                }
                hdr = bpf_jit_binary_pack_hdr(prog);
                bpf_jit_binary_pack_free(hdr, NULL);
+               priv_stack_ptr = prog->aux->priv_stack_ptr;
+               if (priv_stack_ptr) {
+                       priv_stack_alloc_sz = round_up(prog->aux->stack_depth, 
16) +
+                                             2 * PRIV_STACK_GUARD_SZ;
+                       priv_stack_check_guard(priv_stack_ptr, 
priv_stack_alloc_sz, prog);
+                       free_percpu(prog->aux->priv_stack_ptr);
+               }
                WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(prog));
        }
 
@@ -2382,6 +2482,11 @@ bool bpf_jit_supports_fsession(void)
        return true;
 }
 
+/* Indicate the JIT backend supports BPF private stacks. */
+bool bpf_jit_supports_private_stack(void)
+{
+       return true;
+}
+
 /* Indicate the JIT backend supports mixing bpf2bpf and tailcalls. */
 bool bpf_jit_supports_subprog_tailcalls(void)
 {
diff --git a/arch/loongarch/net/bpf_jit.h b/arch/loongarch/net/bpf_jit.h
index a8e29be35fa8..01a7ea47e79b 100644
--- a/arch/loongarch/net/bpf_jit.h
+++ b/arch/loongarch/net/bpf_jit.h
@@ -22,6 +22,7 @@ struct jit_ctx {
        u32 stack_size;
        u64 arena_vm_start;
        u64 user_vm_start;
+       bool priv_sp_used;
 };
 
 struct jit_data {
-- 
2.25.1


Reply via email to