> Correctly align stack pointer on x86 JIT if external calls are present.
>
> Add tests for external calls from BPF program demonstrating the problem:
> * direct verification of a local variable alignment;
> * operations with 128-bit integers;
> * aligned and unaligned SSE2 instructions;
> * memcpy and rte_memcpy (may use vector instructions in their code).
>
> (Such variety is needed because not all of these tests are available or
> reproduce the problem on all targets even when the problem exists.)

Probably worth mentioning in the commit header that, by default, the x86_64
ABI expects the stack pointer to be 16-byte aligned on function entry.
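
To illustrate for readers of the log (a minimal sketch, not part of the
patch, assuming the SysV x86_64 ABI where alignof(max_align_t) is 16): a
callee compiled for that ABI may place locals at 16-byte aligned frame slots
and spill XMM registers with aligned stores, so it can fault when entered
with a misaligned RSP even though the callee itself is correct. The check
below is essentially what stack_alignment_xfunc() in the new test performs:

/*
 * Sketch only: the return value is arg + 1 only when the caller honoured
 * the ABI stack-alignment contract, which the JIT did not do before this
 * patch when emitting external calls.
 */
#include <stdalign.h>
#include <stddef.h>
#include <stdint.h>

uint64_t
aligned_frame_callee(uint64_t arg)
{
	max_align_t local;	/* compiler assumes a suitably aligned frame slot */

	return arg + (((uintptr_t)&local % alignof(max_align_t)) == 0);
}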

Acked-by: Konstantin Ananyev <[email protected]>
Tested-by: Konstantin Ananyev <[email protected]>

>
> Fixes: cc752e43e079 ("bpf: add JIT compilation for x86_64 ISA")
>
> Signed-off-by: Marat Khalili <[email protected]>
> ---
>  app/test/test_bpf.c   | 386 ++++++++++++++++++++++++++++++++++++++++++
>  lib/bpf/bpf_jit_x86.c |  15 +-
>  2 files changed, 400 insertions(+), 1 deletion(-)
>
> diff --git a/app/test/test_bpf.c b/app/test/test_bpf.c
> index b7c94ba1c7..e3f6b21a8b 100644
> --- a/app/test/test_bpf.c
> +++ b/app/test/test_bpf.c
> @@ -3280,6 +3280,392 @@ test_bpf(void)
>
>  REGISTER_FAST_TEST(bpf_autotest, true, true, test_bpf);
>
> +/* Tests of BPF JIT stack alignment when calling external functions (xfuncs). */
> +
> +/* Function called from the BPF program in a test. */
> +typedef uint64_t (*text_xfunc_t)(uint64_t argument);
> +
> +/* Call function from BPF program, verify that it incremented its argument. */
> +static int
> +call_from_bpf_test(text_xfunc_t xfunc)
> +{
> +	static const struct ebpf_insn ins[] = {
> +		{
> +			.code = (BPF_JMP | EBPF_CALL),
> +			.imm = 0, /* xsym #0 */
> +		},
> +		{
> +			.code = (BPF_JMP | EBPF_EXIT),
> +		},
> +	};
> +	const struct rte_bpf_xsym xsym[] = {
> +		{
> +			.name = "xfunc",
> +			.type = RTE_BPF_XTYPE_FUNC,
> +			.func = {
> +				.val = (void *)xfunc,
> +				.nb_args = 1,
> +				.args = {
> +					{
> +						.type = RTE_BPF_ARG_RAW,
> +						.size = sizeof(uint64_t),
> +					},
> +				},
> +				.ret = {
> +					.type = RTE_BPF_ARG_RAW,
> +					.size = sizeof(uint64_t),
> +				},
> +			},
> +		},
> +	};
> +	const struct rte_bpf_prm prm = {
> +		.ins = ins,
> +		.nb_ins = RTE_DIM(ins),
> +		.xsym = xsym,
> +		.nb_xsym = RTE_DIM(xsym),
> +		.prog_arg = {
> +			.type = RTE_BPF_ARG_RAW,
> +			.size = sizeof(uint64_t),
> +		},
> +	};
> +
> +	struct rte_bpf_jit jit;
> +
> +	struct rte_bpf *const bpf = rte_bpf_load(&prm);
> +	RTE_TEST_ASSERT_NOT_EQUAL(bpf, NULL,
> +		"expect rte_bpf_load() != NULL");
> +
> +	RTE_TEST_ASSERT_SUCCESS(rte_bpf_get_jit(bpf, &jit),
> +		"expect rte_bpf_get_jit() to succeed");
> +
> +	const text_xfunc_t jit_function = (void *)jit.func;
> +	if (jit_function == NULL) {
> +		rte_bpf_destroy(bpf);
> +		return TEST_SKIPPED;
> +	}
> +
> +	const uint64_t argument = 42;
> +	const uint64_t result = jit_function(argument);
> +	rte_bpf_destroy(bpf);
> +
> +	RTE_TEST_ASSERT_EQUAL(result, argument + 1,
> +		"expect result == %ju, found %ju",
> +		(uintmax_t)(argument + 1), (uintmax_t)result);
> +
> +	return TEST_SUCCESS;
> +}
> +
> +/*
> + * Test alignment of a local variable.
> + *
> + * NOTE: May produce false negatives with sanitizers if they replace the stack.
> + */
> +
> +/* Copy of the pointer to max_align stack variable, volatile to thwart optimization. */
> +static volatile uintptr_t stack_alignment_test_pointer;
> +
> +static uint64_t
> +stack_alignment_xfunc(uint64_t argument)
> +{
> +	max_align_t max_align;
> +	stack_alignment_test_pointer = (uintptr_t)&max_align;
> +	return argument + 1;
> +}
> +
> +static int
> +test_stack_alignment(void)
> +{
> +	const int test_rc = call_from_bpf_test(stack_alignment_xfunc);
> +	if (test_rc == TEST_SKIPPED)
> +		return TEST_SKIPPED;
> +
> +	RTE_TEST_ASSERT_SUCCESS(test_rc,
> +		"expect call_from_bpf_test(stack_alignment_xfunc) to succeed");
> +
> +	const uintptr_t test_offset = stack_alignment_test_pointer;
> +	RTE_TEST_ASSERT_NOT_EQUAL(test_offset, 0, "expect test_pointer != 0");
> +
> +	const size_t test_alignment = test_offset % alignof(max_align_t);
> +	RTE_TEST_ASSERT_EQUAL(test_alignment, 0,
> +		"expect test_alignment == 0, found %zu", test_alignment);
> +
> +	return TEST_SUCCESS;
> +}
> +
> +REGISTER_FAST_TEST(bpf_stack_alignment_autotest, true, true, test_stack_alignment);
> +
> +/*
> + * Test copying `__uint128_t`.
> + *
> + * This operation is used by some variations of `rte_memcpy`;
> + * it can also be produced by vectorizer in the compiler.
> + */
> +
> +#if defined(__SIZEOF_INT128__)
> +
> +static uint64_t
> +stack_copy_uint128_xfunc(uint64_t argument)
> +{
> +	/* Pass addresses through volatiles to prevent compiler from optimizing it all out. */
> +	char alignas(16) src_buffer[16];
> +	char alignas(16) dst_buffer[16];
> +	void *const src = (char *volatile)src_buffer;
> +	void *const dst = (char *volatile)dst_buffer;
> +	const size_t size = 16;
> +
> +	memset(src, 0x2a, size);
> +	memset(dst, 0x55, size);
> +	const int initial_memcmp_rc = memcmp(dst, src, size);
> +
> +	const __uint128_t *const src128 = (const __uint128_t *)src;
> +	__uint128_t *const dst128 = (__uint128_t *)dst;
> +	*dst128 = *src128;
> +	const int memcmp_rc = memcmp(dst, src, size);
> +
> +	return argument + 1 + !initial_memcmp_rc + memcmp_rc;
> +}
> +
> +static int
> +test_stack_copy_uint128(void)
> +{
> +	const int test_rc = call_from_bpf_test(stack_copy_uint128_xfunc);
> +	if (test_rc == TEST_SKIPPED)
> +		return TEST_SKIPPED;
> +
> +	RTE_TEST_ASSERT_SUCCESS(test_rc,
> +		"expect call_from_bpf_test(stack_copy_uint128_xfunc) to succeed");
> +
> +	return TEST_SUCCESS;
> +}
> +
> +#else
> +
> +static int
> +test_stack_copy_uint128(void)
> +{
> +	return TEST_SKIPPED;
> +}
> +
> +#endif
> +
> +REGISTER_FAST_TEST(bpf_stack_copy_uint128_autotest, true, true, test_stack_copy_uint128);
> +
> +/*
> + * Test SSE2 load and store intrinsics.
> + *
> + * These intrinsics are used by e.g. lib/hash.
> + *
> + * Test both aligned and unaligned versions. Unaligned intrinsics may still fail
> + * when the stack is misaligned, since they only treat memory address as
> + * unaligned, not stack.
> + */
> +
> +#if defined(__SSE2__)
> +
> +static uint64_t
> +stack_sse2_aligned_xfunc(uint64_t argument)
> +{
> +	/* Pass addresses through volatiles to prevent compiler from optimizing it all out. */
> +	char alignas(16) src_buffer[16];
> +	char alignas(16) dst_buffer[16];
> +	void *const src = (char *volatile)src_buffer;
> +	void *const dst = (char *volatile)dst_buffer;
> +	const size_t size = 16;
> +
> +	memset(src, 0x2a, size);
> +	memset(dst, 0x55, size);
> +	const int initial_memcmp_rc = memcmp(dst, src, size);
> +
> +	const __m128i tmp = _mm_load_si128((const __m128i *)src);
> +	_mm_store_si128((__m128i *)dst, tmp);
> +	const int memcmp_rc = memcmp(dst, src, size);
> +
> +	return argument + 1 + !initial_memcmp_rc + memcmp_rc;
> +}
> +
> +static uint64_t
> +stack_sse2_unaligned_xfunc(uint64_t argument)
> +{
> +	/* Pass addresses through volatiles to prevent compiler from optimizing it all out. */
> +	char alignas(16) src_buffer[17];
> +	char alignas(16) dst_buffer[17];
> +	void *const src = (char *volatile)src_buffer + 1;
> +	void *const dst = (char *volatile)dst_buffer + 1;
> +	const size_t size = 16;
> +
> +	memset(src, 0x2a, size);
> +	memset(dst, 0x55, size);
> +	const int initial_memcmp_rc = memcmp(dst, src, size);
> +
> +	const __m128i tmp = _mm_loadu_si128((const __m128i *)src);
> +	_mm_storeu_si128((__m128i *)dst, tmp);
> +	const int memcmp_rc = memcmp(dst, src, size);
> +
> +	return argument + 1 + !initial_memcmp_rc + memcmp_rc;
> +}
> +
> +static int
> +test_stack_sse2(void)
> +{
> +	int test_rc;
> +
> +	test_rc = call_from_bpf_test(stack_sse2_aligned_xfunc);
> +	if (test_rc == TEST_SKIPPED)
> +		return test_rc;
> +	RTE_TEST_ASSERT_SUCCESS(test_rc,
> +		"expect call_from_bpf_test(stack_sse2_aligned_xfunc) to succeed");
> +
> +	test_rc = call_from_bpf_test(stack_sse2_unaligned_xfunc);
> +	if (test_rc == TEST_SKIPPED)
> +		return test_rc;
> +	RTE_TEST_ASSERT_SUCCESS(test_rc,
> +		"expect call_from_bpf_test(stack_sse2_unaligned_xfunc) to succeed");
> +
> +	return TEST_SUCCESS;
> +}
> +
> +#else
> +
> +static int
> +test_stack_sse2(void)
> +{
> +	return TEST_SKIPPED;
> +}
> +
> +#endif
> +
> +REGISTER_FAST_TEST(bpf_stack_sse2_autotest, true, true, test_stack_sse2);
> +
> +/*
> + * Run memcpy and rte_memcpy with various data sizes and offsets (unaligned and aligned).
> + *
> + * May produce false negatives even if BPF breaks stack alignment since
> + * compilers may realign the stack in the beginning of the function to use
> + * vector instructions with width larger than the default stack alignment.
> + * However, represents very important use case that was broken in practice.
> + *
> + * For the reason specified above test 16-byte fixed-width memcpy explicitly.
> + */
> +
> +static void *volatile stack_memcpy_dst;
> +static const void *volatile stack_memcpy_src;
> +static size_t volatile stack_memcpy_size;
> +
> +static uint64_t
> +stack_memcpy16_xfunc(uint64_t argument)
> +{
> +	RTE_ASSERT(stack_memcpy_size == 16);
> +	memcpy(stack_memcpy_dst, stack_memcpy_src, 16);
> +	return argument + 1;
> +}
> +
> +static uint64_t
> +stack_rte_memcpy16_xfunc(uint64_t argument)
> +{
> +	RTE_ASSERT(stack_memcpy_size == 16);
> +	rte_memcpy(stack_memcpy_dst, stack_memcpy_src, 16);
> +	return argument + 1;
> +}
> +
> +static uint64_t
> +stack_memcpy_xfunc(uint64_t argument)
> +{
> +	memcpy(stack_memcpy_dst, stack_memcpy_src, stack_memcpy_size);
> +	return argument + 1;
> +}
> +
> +static uint64_t
> +stack_rte_memcpy_xfunc(uint64_t argument)
> +{
> +	rte_memcpy(stack_memcpy_dst, stack_memcpy_src, stack_memcpy_size);
> +	return argument + 1;
> +}
> +
> +static int
> +stack_memcpy_subtest(text_xfunc_t xfunc, size_t size, size_t src_offset, size_t dst_offset)
> +{
> +	stack_memcpy_size = size;
> +
> +	char *const src_buffer = malloc(size + src_offset);
> +	memset(src_buffer + src_offset, 0x2a, size);
> +	stack_memcpy_src = src_buffer + src_offset;
> +
> +	char *const dst_buffer = malloc(size + dst_offset);
> +	memset(dst_buffer + dst_offset, 0x55, size);
> +	stack_memcpy_dst = dst_buffer + dst_offset;
> +
> +	const int initial_memcmp_rc = memcmp(stack_memcpy_dst, stack_memcpy_src, size);
> +	const int test_rc = call_from_bpf_test(xfunc);
> +	const int memcmp_rc = memcmp(stack_memcpy_dst, stack_memcpy_src, size);
> +
> +	free(dst_buffer);
> +	free(src_buffer);
> +
> +	if (test_rc == TEST_SKIPPED)
> +		return TEST_SKIPPED;
> +
> +	RTE_TEST_ASSERT_FAIL(initial_memcmp_rc, "expect memcmp() to fail initially");
> +	RTE_TEST_ASSERT_SUCCESS(test_rc, "expect call_from_bpf_test(xfunc) to succeed");
> +	RTE_TEST_ASSERT_SUCCESS(memcmp_rc, "expect memcmp() to succeed");
> +
> +	return TEST_SUCCESS;
> +}
> +
> +static int
> +test_stack_memcpy(void)
> +{
> +	for (int offsets = 0; offsets < 4; ++offsets) {
> +		const bool src_offset = offsets & 1;
> +		const bool dst_offset = offsets & 2;
> +		int test_rc;
> +
> +		test_rc = stack_memcpy_subtest(stack_memcpy16_xfunc,
> +			16, src_offset, dst_offset);
> +		if (test_rc == TEST_SKIPPED)
> +			return test_rc;
> +		RTE_TEST_ASSERT_SUCCESS(test_rc,
> +			"expect stack_memcpy_subtest(stack_memcpy16_xfunc, "
> +			"16, %i, %i) to succeed",
> +			src_offset, dst_offset);
> +
> +		test_rc = stack_memcpy_subtest(stack_rte_memcpy16_xfunc,
> +			16, src_offset, dst_offset);
> +		if (test_rc == TEST_SKIPPED)
> +			return test_rc;
> +		RTE_TEST_ASSERT_SUCCESS(test_rc,
> +			"expect stack_memcpy_subtest(stack_rte_memcpy16_xfunc, "
> +			"16, %i, %i) to succeed",
> +			src_offset, dst_offset);
> +
> +		for (size_t size = 1; size <= 1024; size <<= 1) {
> +			const bool src_offset = offsets & 1;
> +			const bool dst_offset = offsets & 2;
> +			int test_rc;
> +
> +			test_rc = stack_memcpy_subtest(stack_memcpy_xfunc,
> +				size, src_offset, dst_offset);
> +			if (test_rc == TEST_SKIPPED)
> +				return test_rc;
> +			RTE_TEST_ASSERT_SUCCESS(test_rc,
> +				"expect stack_memcpy_subtest(stack_memcpy_xfunc, "
> +				"%zu, %i, %i) to succeed",
> +				size, src_offset, dst_offset);
> +
> +			test_rc = stack_memcpy_subtest(stack_rte_memcpy_xfunc,
> +				size, src_offset, dst_offset);
> +			if (test_rc == TEST_SKIPPED)
> +				return test_rc;
> +			RTE_TEST_ASSERT_SUCCESS(test_rc,
> +				"expect stack_memcpy_subtest(stack_rte_memcpy_xfunc, "
> +				"%zu, %i, %i) to succeed",
> +				size, src_offset, dst_offset);
> +		}
> +	}
> +	return TEST_SUCCESS;
> +}
> +
> +REGISTER_FAST_TEST(bpf_stack_memcpy_autotest, true, true, test_stack_memcpy);
> +
>  #ifdef TEST_BPF_ELF_LOAD
>
>  /*
> diff --git a/lib/bpf/bpf_jit_x86.c b/lib/bpf/bpf_jit_x86.c
> index 4d74e418f8..09865acabb 100644
> --- a/lib/bpf/bpf_jit_x86.c
> +++ b/lib/bpf/bpf_jit_x86.c
> @@ -93,6 +93,8 @@ enum {
>  /*
>   * callee saved registers list.
>   * keep RBP as the last one.
> + * RBP is marked as used every time we have external calls
> + * since we need it to save RSP before stack realignment.
>   */
>  static const uint32_t save_regs[] = {RBX, R12, R13, R14, R15, RBP};
>
> @@ -685,6 +687,8 @@ emit_call(struct bpf_jit_state *st, uintptr_t trg)
>  	const uint8_t ops = 0xFF;
>  	const uint8_t mods = 2;
>
> +	/* Mark RBP as used to trigger stack realignment in prolog. */
> +	USED(st->reguse, RBP);
>  	emit_ld_imm64(st, RAX, trg, trg >> 32);
>  	emit_bytes(st, &ops, sizeof(ops));
>  	emit_modregrm(st, MOD_DIRECT, mods, RAX);
> @@ -1204,7 +1208,6 @@ emit_prolog(struct bpf_jit_state *st, int32_t stack_size)
>  	if (spil == 0)
>  		return;
>
> -
>  	emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP,
>  		spil * sizeof(uint64_t));
>
> @@ -1220,6 +1223,16 @@ emit_prolog(struct bpf_jit_state *st, int32_t stack_size)
>  	if (INUSE(st->reguse, RBP) != 0) {
>  		emit_mov_reg(st, EBPF_ALU64 | EBPF_MOV | BPF_X, RSP, RBP);
>  		emit_alu_imm(st, EBPF_ALU64 | BPF_SUB | BPF_K, RSP, stack_size);
> +
> +		/*
> +		 * Align stack pointer appropriately for function calls.
> +		 * We do not have direct access to function stack alignment
> +		 * value (like gcc internal macro INCOMING_STACK_BOUNDARY),
> +		 * but hopefully `alignof(max_align_t)` will always be greater.
> +		 * Original stack pointer will be restored from rbp.
> +		 */
> +		emit_alu_imm(st, EBPF_ALU64 | BPF_AND | BPF_K, RSP,
> +			-(uint32_t)alignof(max_align_t));
>  	}
>  }
>
> --
> 2.43.0
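
One more note for the archives on the emit_prolog() change: the added
BPF_AND with -(uint32_t)alignof(max_align_t) rounds the stack pointer down
to the next multiple of that alignment, so it can only grow the reserved
area, and the original value comes back from RBP, as the comment in the
patch says. A minimal sketch of that arithmetic (the helper name is mine,
not from the patch; 16 assumes alignof(max_align_t) == 16 as on x86_64):

#include <stdint.h>
#include <stdio.h>

/* Masking with the negated alignment clears the low bits, i.e. rounds the
 * value down to a multiple of `align` (which must be a power of two). */
static uintptr_t
align_down(uintptr_t sp, uintptr_t align)
{
	return sp & ~(align - 1);	/* same bits as sp & -align */
}

int
main(void)
{
	/* Example RSP value after the SUB of stack_size in the prolog. */
	const uintptr_t sp = 0x7ffc0000abcdUL;

	printf("%#jx -> %#jx\n", (uintmax_t)sp, (uintmax_t)align_down(sp, 16));
	/* Prints: 0x7ffc0000abcd -> 0x7ffc0000abc0 */
	return 0;
}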

