https://gcc.gnu.org/bugzilla/show_bug.cgi?id=88096

            Bug ID: 88096
           Summary: wrong inline AVX512F optimization
           Product: gcc
           Version: 7.0
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: rtl-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: thomas at monjalon dot net
  Target Milestone: ---

In the DPDK project, a bug was found with a specific sequence of
inlined intrinsics code when AVX512F optimization is enabled.

Summary of the issue:
        - CPU: Intel Skylake
        - Linux environment: Ubuntu 18.04
        - Compiler: GCC 7 or 8 (7.3.0-27ubuntu1~18.04 or 8.2.0-1ubuntu2~18.04)
        - Compiler optimizations: -march=native, -O1 and higher
        - Scenario: testpmd application crashes when it starts forwarding
        - Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
        - Context: several nested inline functions
        - Workaround: disable AVX512 optimization with -mno-avx512f

The URL of the bug report in DPDK project is
        https://bugs.dpdk.org/show_bug.cgi?id=97

Steps to reproduce:
        - run Ubuntu 18.04 on Skylake CPU
                CPU check: grep -m1 avx512 /proc/cpuinfo
        - compile DPDK mlx5 PMD
                sudo apt-get install rdma-core
                git clone -b v18.11-rc1 http://dpdk.org/git/dpdk
                cd dpdk
                make defconfig
                sed -ri 's,(MLX5_PMD=)n,\1y,' build/.config
                sed -ri 's,(KMOD=)y,\1n,' build/.config
                sed -ri 's,(UIO=)y,\1n,' build/.config
                make -j EXTRA_CFLAGS=-ggdb
        - match bad instruction pattern
                gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \
                -ex 'disassemble/rs mlx5_tx_burst' | grep 'vmovdqu.\? .*\*8+0x[2-6]\]'

AVX512F is disabled in dpdk-18.11-rc2:
        http://git.dpdk.org/dpdk/commit/?id=8d07c82b

The DPDK code can be browsed at
        http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h#n305

Call stack of the inline functions:

        mlx5_tx_complete
          rte_mempool_put_bulk
            rte_mempool_generic_put
              __mempool_generic_put
                rte_memcpy
                  rte_memcpy_generic
                    rte_mov128
                      rte_mov32
                        _mm256_loadu_si256

Code of the low-level functions:

        /*
         * Copy 32 bytes from src to dst with a single 256-bit load followed
         * by a single 256-bit store. The unaligned intrinsic variants are
         * used, so neither pointer is required to be 32-byte aligned.
         */
        static __rte_always_inline void
        rte_mov32(uint8_t *dst, const uint8_t *src)
        {
                __m256i ymm0;
                /* Load 32 bytes from src into a 256-bit value. */
                ymm0 = _mm256_loadu_si256((const __m256i *)src);
                /* Store those 32 bytes to dst. */
                _mm256_storeu_si256((__m256i *)dst, ymm0);
        }
        /*
         * Copy 128 bytes from src to dst as four consecutive 32-byte copies
         * through rte_mov32(), at byte offsets 0, 32, 64 and 96 of both
         * buffers.
         */
        static inline void
        rte_mov128(uint8_t *dst, const uint8_t *src)
        {
                rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 *
32);
                rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 *
32);
                rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 *
32);
                rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 *
32);
        }

The bug appears in all calls of the inline function mlx5_tx_complete().
When disabling AVX512F, we see a different memory offset:

--- bad-rte_mov128-avx512-enabled
+++ good-rte_mov128-avx512-disabled
-       vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
+       vmovdqu xmm0,XMMWORD PTR [rax*8+0x0]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
        vmovups XMMWORD PTR [rsi],xmm0
        vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1
-       vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
+       vmovdqu xmm0,XMMWORD PTR [rax*8+0x20]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
        vmovups XMMWORD PTR [rsi+0x20],xmm0
        vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
-       vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
+       vmovdqu xmm0,XMMWORD PTR [rax*8+0x40]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
        vmovups XMMWORD PTR [rsi+0x40],xmm0
        vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
-       vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
+       vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
        vmovups XMMWORD PTR [rsi+0x60],xmm0
        vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1

As a result, data copied with rte_memcpy() is corrupted
from:
    0x0000000109c43e00    0x0000000109c434c0
    0x0000000109c42b80    0x0000000109c42240
    0x0000000109c41900    0x0000000109c40fc0
    0x0000000109c40680    0x0000000109c3fd40
    0x0000000109c3f400    0x0000000109c3eac0
    0x0000000109c3e180    0x0000000109c3d840
    0x0000000109c3cf00    0x0000000109c3c5c0
    0x0000000109c3bc80    0x0000000109c3b340
to:
    0x0000000109c43e00    0x0000000109c434c0
    0x0000000109c42b80    0x0000000109c42240
    0x34c00000000109c4    0x2b800000000109c4
    0x0000000109c40680    0x0000000109c3fd40
    0x09c434c000000001    0x09c42b8000000001
    0x0000000109c3e180    0x0000000109c3d840
    0x000109c434c00000    0x000109c42b800000
    0x0000000109c3bc80    0x0000000109c3b340

If further analysis is needed, the calls to rte_mov128() can be isolated
with this patch:

--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -330,13 +330,19 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+#include <rte_atomic.h>
+static volatile int dpdk_bug97_marker __attribute__((used));
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
+       dpdk_bug97_marker = 0xdbdb97be; /* sequence begins */
+       rte_mb();
        rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
        rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
        rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
        rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+       rte_mb();
+       dpdk_bug97_marker = 0xdbdb97ed; /* sequence ends */
 }

The disassembled sequence can be found with this kind of sed command:

        gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \
                -ex 'disassemble/rs mlx5_tx_burst' |
        sed -rn 's,.*0x00.*:[[:space:]]*([0-9a-f][0-9a-f][[:space:]])*,,p' |
        sed '/0xdbdb97be/,/0xdbdb97ed/!d' | sed '/0xdbdb97ed/s,$,\n---,'

Note: mlx5_tx_burst() is one example of an affected function,
because it calls mlx5_tx_complete(), which is the top of the call stack above.

You can find below all four inline calls to rte_mov128() from mlx5_tx_burst().
Only the third call (from mlx5_tx_complete()) has the offsets bug.

        mov    DWORD PTR [rip+0x937b56],0xdbdb97be        # 0xde0db0
<dpdk_bug97_marker>
        lea    r13,[rdx+0x80]
        mov    QWORD PTR [rbp-0xe0],r8
        lea    r8,[rcx+0x80]
        mfence 
        vmovdqu8 xmm0,XMMWORD PTR [rdx]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1
        vmovups XMMWORD PTR [rcx],xmm0
        vextracti128 XMMWORD PTR [rcx+0x10],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1
        vmovups XMMWORD PTR [rcx+0x20],xmm0
        vextracti128 XMMWORD PTR [rcx+0x30],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1
        vmovups XMMWORD PTR [rcx+0x40],xmm0
        vextracti128 XMMWORD PTR [rcx+0x50],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1
        vmovups XMMWORD PTR [rcx+0x60],xmm0
        vextracti128 XMMWORD PTR [rcx+0x70],ymm0,0x1
        mfence 
        mov    DWORD PTR [rip+0x937acb],0xdbdb97ed        # 0xde0db0
<dpdk_bug97_marker>
        ---
        mov    DWORD PTR [rip+0x937912],0xdbdb97be        # 0xde0db0
<dpdk_bug97_marker>
        lea    rsi,[rax-0x80]
        lea    r12,[rdx+0x80]
        mfence 
        vmovdqu8 xmm0,XMMWORD PTR [rdx]
        mov    rdi,QWORD PTR [rbp-0x60]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1
        lea    rcx,[rdi+0xb0]
        vmovups XMMWORD PTR [rdi+0x30],xmm0
        vextracti128 XMMWORD PTR [rdi+0x40],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1
        vmovups XMMWORD PTR [rdi+0x50],xmm0
        vextracti128 XMMWORD PTR [rdi+0x60],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1
        vmovups XMMWORD PTR [rdi+0x70],xmm0
        vextracti128 XMMWORD PTR [rdi+0x80],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1
        vmovups XMMWORD PTR [rdi+0x90],xmm0
        vextracti128 XMMWORD PTR [rdi+0xa0],ymm0,0x1
        mfence 
        mov    DWORD PTR [rip+0x93787c],0xdbdb97ed        # 0xde0db0
<dpdk_bug97_marker>
        ---
        mov    DWORD PTR [rip+0x936fd2],0xdbdb97be        # 0xde0db0
<dpdk_bug97_marker>
        lea    rcx,[rax-0x80]
        sub    rdx,0xffffffffffffff80
        mfence 
        mov    rax,QWORD PTR [rbp-0xa0]
        vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
        vmovups XMMWORD PTR [rdx-0x80],xmm0
        vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
        vmovups XMMWORD PTR [rdx-0x60],xmm0
        vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
        vmovups XMMWORD PTR [rdx-0x40],xmm0
        vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
        vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
        lea    rax,[r9+0x80]
        vmovups XMMWORD PTR [rdx-0x20],xmm0
        vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1
        mfence 
        mov    DWORD PTR [rip+0x936f24],0xdbdb97ed        # 0xde0db0
<dpdk_bug97_marker>
        ---
        mov    DWORD PTR [rip+0x936e2f],0xdbdb97be        # 0xde0db0
<dpdk_bug97_marker>
        lea    rdi,[r9+0x80]
        add    rax,0xffffffffffffff80
        mfence 
        vmovdqu8 xmm0,XMMWORD PTR [r9]
        sub    rdx,0xffffffffffffff80
        vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x10],0x1
        vmovups XMMWORD PTR [rdx-0x80],xmm0
        vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [r9+0x20]
        vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x30],0x1
        vmovups XMMWORD PTR [rdx-0x60],xmm0
        vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [r9+0x40]
        vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x50],0x1
        vmovups XMMWORD PTR [rdx-0x40],xmm0
        vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1
        vmovdqu8 xmm0,XMMWORD PTR [r9+0x60]
        vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x70],0x1
        vmovups XMMWORD PTR [rdx-0x20],xmm0
        vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1
        mfence 
        mov    DWORD PTR [rip+0x936da9],0xdbdb97ed        # 0xde0db0
<dpdk_bug97_marker>

Reply via email to