https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88096
Bug ID: 88096 Summary: wrong inline AVX512F optimization Product: gcc Version: 7.0 Status: UNCONFIRMED Severity: normal Priority: P3 Component: rtl-optimization Assignee: unassigned at gcc dot gnu.org Reporter: thomas at monjalon dot net Target Milestone: --- In the project DPDK, a bug is found with a special sequence of inlined intrinsics code when AVX512F optimization is enabled. Summary of the issue: - CPU: Intel Skylake - Linux environment: Ubuntu 18.04 - Compiler: GCC 7 or 8 (7.3.0-27ubuntu1~18.04 or 8.2.0-1ubuntu2~18.04) - Compiler optimizations: -march=native, -O1 and higher - Scenario: testpmd application crashes when it starts forwarding - Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512 - Context: several nested inline functions - Workaround: disable AVX512 optimization with -mno-avx512f The URL of the bug report in DPDK project is https://bugs.dpdk.org/show_bug.cgi?id=97 Steps to reproduce: - run Ubuntu 18.04 on Skylake CPU CPU check: grep -m1 avx512 /proc/cpuinfo - compile DPDK mlx5 PMD sudo apt-get install rdma-core git clone -b v18.11-rc1 http://dpdk.org/git/dpdk cd dpdk make defconfig sed -ri 's,(MLX5_PMD=)n,\1y,' build/.config sed -ri 's,(KMOD=)y,\1n,' build/.config sed -ri 's,(UIO=)y,\1n,' build/.config make -j EXTRA_CFLAGS=-ggdb - match bad instruction pattern gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \ -ex 'disassemble/rs mlx5_tx_burst' | grep 'vmovdqu.\? 
.*\*8+0x[2-6]\]' AVX512F is disabled in dpdk-18.11-rc2: http://git.dpdk.org/dpdk/commit/?id=8d07c82b The DPDK code can be browsed at http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h#n305 Call stack of the inline functions: mlx5_tx_complete rte_mempool_put_bulk rte_mempool_generic_put __mempool_generic_put rte_memcpy rte_memcpy_generic rte_mov128 rte_mov32 _mm256_loadu_si256 Code of the low-level functions: static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); } static inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); } The bug appears in all calls of the inline function mlx5_tx_complete(). 
When AVX512F is disabled, the load offsets differ (0x20/0x40/0x60 in the good code instead of 0x2/0x4/0x6 in the bad code): --- bad-rte_mov128-avx512-enabled +++ good-rte_mov128-avx512-disabled - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x0] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1 vmovups XMMWORD PTR [rsi],xmm0 vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1 vmovups XMMWORD PTR [rsi+0x20],xmm0 vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1 vmovups XMMWORD PTR [rsi+0x40],xmm0 vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1 vmovups XMMWORD PTR [rsi+0x60],xmm0 vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1 As a result, the data copied by rte_memcpy() is corrupted. The source data is: 0x0000000109c43e00 0x0000000109c434c0 0x0000000109c42b80 0x0000000109c42240 0x0000000109c41900 0x0000000109c40fc0 0x0000000109c40680 0x0000000109c3fd40 0x0000000109c3f400 0x0000000109c3eac0 0x0000000109c3e180 0x0000000109c3d840 0x0000000109c3cf00 0x0000000109c3c5c0 0x0000000109c3bc80 0x0000000109c3b340 and the copied data is: 0x0000000109c43e00 0x0000000109c434c0 0x0000000109c42b80 0x0000000109c42240 0x34c00000000109c4 0x2b800000000109c4 0x0000000109c40680 0x0000000109c3fd40 0x09c434c000000001 0x09c42b8000000001 0x0000000109c3e180 0x0000000109c3d840 0x000109c434c00000 0x000109c42b800000 0x0000000109c3bc80 0x0000000109c3b340 If further analysis is needed, the calls to rte_mov128() can be isolated with this patch: --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -330,13 +330,19 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. 
*/ +#include <rte_atomic.h> +static volatile int dpdk_bug97_marker __attribute__((used)); static inline void rte_mov128(uint8_t *dst, const uint8_t *src) { + dpdk_bug97_marker = 0xdbdb97be; /* sequence begins */ + rte_mb(); rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); + rte_mb(); + dpdk_bug97_marker = 0xdbdb97ed; /* sequence ends */ } The disassembled sequence can be found with this kind of sed command: gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \ -ex 'disassemble/rs mlx5_tx_burst' | sed -rn 's,.*0x00.*:[[:space:]]*([0-9a-f][0-9a-f][[:space:]])*,,p' | sed '/0xdbdb97be/,/0xdbdb97ed/!d' | sed '/0xdbdb97ed/s,$,\n---,' Note: mlx5_tx_burst() is one example of an affected function, because it calls mlx5_tx_complete(), which is the top of the call stack above. All four inlined calls to rte_mov128() from mlx5_tx_burst() are listed below. Only the third call (the one from mlx5_tx_complete()) has the offset bug. 
mov DWORD PTR [rip+0x937b56],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea r13,[rdx+0x80] mov QWORD PTR [rbp-0xe0],r8 lea r8,[rcx+0x80] mfence vmovdqu8 xmm0,XMMWORD PTR [rdx] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1 vmovups XMMWORD PTR [rcx],xmm0 vextracti128 XMMWORD PTR [rcx+0x10],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1 vmovups XMMWORD PTR [rcx+0x20],xmm0 vextracti128 XMMWORD PTR [rcx+0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1 vmovups XMMWORD PTR [rcx+0x40],xmm0 vextracti128 XMMWORD PTR [rcx+0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1 vmovups XMMWORD PTR [rcx+0x60],xmm0 vextracti128 XMMWORD PTR [rcx+0x70],ymm0,0x1 mfence mov DWORD PTR [rip+0x937acb],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x937912],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rsi,[rax-0x80] lea r12,[rdx+0x80] mfence vmovdqu8 xmm0,XMMWORD PTR [rdx] mov rdi,QWORD PTR [rbp-0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1 lea rcx,[rdi+0xb0] vmovups XMMWORD PTR [rdi+0x30],xmm0 vextracti128 XMMWORD PTR [rdi+0x40],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1 vmovups XMMWORD PTR [rdi+0x50],xmm0 vextracti128 XMMWORD PTR [rdi+0x60],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1 vmovups XMMWORD PTR [rdi+0x70],xmm0 vextracti128 XMMWORD PTR [rdi+0x80],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR 
[rdx+0x70],0x1 vmovups XMMWORD PTR [rdi+0x90],xmm0 vextracti128 XMMWORD PTR [rdi+0xa0],ymm0,0x1 mfence mov DWORD PTR [rip+0x93787c],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x936fd2],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rcx,[rax-0x80] sub rdx,0xffffffffffffff80 mfence mov rax,QWORD PTR [rbp-0xa0] vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1 vmovups XMMWORD PTR [rdx-0x80],xmm0 vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1 vmovups XMMWORD PTR [rdx-0x60],xmm0 vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1 vmovups XMMWORD PTR [rdx-0x40],xmm0 vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1 lea rax,[r9+0x80] vmovups XMMWORD PTR [rdx-0x20],xmm0 vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1 mfence mov DWORD PTR [rip+0x936f24],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x936e2f],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rdi,[r9+0x80] add rax,0xffffffffffffff80 mfence vmovdqu8 xmm0,XMMWORD PTR [r9] sub rdx,0xffffffffffffff80 vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x10],0x1 vmovups XMMWORD PTR [rdx-0x80],xmm0 vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x30],0x1 vmovups XMMWORD PTR [rdx-0x60],xmm0 vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x50],0x1 vmovups XMMWORD PTR [rdx-0x40],xmm0 vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x70],0x1 vmovups XMMWORD PTR [rdx-0x20],xmm0 vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1 mfence mov DWORD PTR [rip+0x936da9],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker>