Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Mon, Feb 26, 2024 at 6:30 PM H.J. Lu wrote: > > On Sun, Feb 25, 2024 at 8:25 PM H.J. Lu wrote: > > > > On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu wrote: > > > > > > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu wrote: > > > > > > > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu wrote: > > > > > > > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > > > > > > > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > > > > > _tile_loadconfig implemented as > > > > > > > > > > > > extern __inline void > > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > > _tile_loadconfig (const void *__config) > > > > > > { > > > > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > > > **)__config))); > > > > > > } > > > > > > > > > > > > GCC sees: > > > > > > > > > > > > (parallel [ > > > > > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > > > > > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > > > > > (const_int -64 [0xffc0])) [1 > > > > > > MEM[(const void * *)&tile_data]+0 S8 A128])] > > > > > >[(asm_input:DI ("m"))] > > > > > >(clobber (reg:CC 17 flags))]) > > > > > > > > > > > > and the memory operand size is 1 byte. As the result, the rest of > > > > > > 511 > > > > > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg > > > > > > intrinsics > > > > > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > > > PR target/114098 > > > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > > > > > __builtin_ia32_ldtilecfg. > > > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > > > > > * config/i386/i386-builtin.def (BDESC): Add > > > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > > > > > * config/i386/i386.md (ldtilecfg): New pattern. 
> > > > > > (sttilecfg): Likewise. > > > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > > > PR target/114098 > > > > > > * gcc.target/i386/amxtile-4.c: New test. > > > > > > --- > > > > > > gcc/config/i386/amxtileintrin.h | 4 +- > > > > > > gcc/config/i386/i386-builtin.def | 4 ++ > > > > > > gcc/config/i386/i386-expand.cc| 19 > > > > > > gcc/config/i386/i386.md | 24 ++ > > > > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 > > > > > > +++ > > > > > > 5 files changed, 104 insertions(+), 2 deletions(-) > > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > > > > > > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > > > > > b/gcc/config/i386/amxtileintrin.h > > > > > > index d1a26e0fea5..5081b326498 100644 > > > > > > --- a/gcc/config/i386/amxtileintrin.h > > > > > > +++ b/gcc/config/i386/amxtileintrin.h > > > > > > @@ -39,14 +39,14 @@ extern __inline void > > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > > _tile_loadconfig (const void *__config) > > > > > > { > > > > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > > > **)__config))); > > > > > > + __builtin_ia32_ldtilecfg (__config); > > > > > > } > > > > > > > > > > > > extern __inline void > > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > > _tile_storeconfig (void *__config) > > > > > > { > > > > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void > > > > > > **)__config))); > > > > > > + __builtin_ia32_sttilecfg (__config); > > > > > > } > > > > > > > > > > > > extern __inline void > > > > > > diff --git a/gcc/config/i386/i386-builtin.def > > > > > > b/gcc/config/i386/i386-builtin.def > > > > > > index 729355230b8..88dd7f8857f 100644 > > > > > > --- a/gcc/config/i386/i386-builtin.def > > > > > > +++ b/gcc/config/i386/i386-builtin.def > > > > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | > > > > > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b > 
> > > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > > > > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", > > > > > > IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > > > > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > > > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > > > > > > > > +/* LDFILECFG and STFILECFG. */ > > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > > > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", > > > > > > IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID) > > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > >
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Sun, Feb 25, 2024 at 8:25 PM H.J. Lu wrote: > > On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu wrote: > > > > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu wrote: > > > > > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu wrote: > > > > > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > > > > > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > > > > _tile_loadconfig implemented as > > > > > > > > > > extern __inline void > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > _tile_loadconfig (const void *__config) > > > > > { > > > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > > **)__config))); > > > > > } > > > > > > > > > > GCC sees: > > > > > > > > > > (parallel [ > > > > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > > > > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > > > > (const_int -64 [0xffc0])) [1 > > > > > MEM[(const void * *)&tile_data]+0 S8 A128])] > > > > >[(asm_input:DI ("m"))] > > > > >(clobber (reg:CC 17 flags))]) > > > > > > > > > > and the memory operand size is 1 byte. As the result, the rest of 511 > > > > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > > > > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > > > > > > > gcc/ChangeLog: > > > > > > > > > > PR target/114098 > > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > > > > __builtin_ia32_ldtilecfg. > > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > > > > * config/i386/i386-builtin.def (BDESC): Add > > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > > > > * config/i386/i386.md (ldtilecfg): New pattern. > > > > > (sttilecfg): Likewise. > > > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > > > PR target/114098 > > > > > * gcc.target/i386/amxtile-4.c: New test. 
> > > > > --- > > > > > gcc/config/i386/amxtileintrin.h | 4 +- > > > > > gcc/config/i386/i386-builtin.def | 4 ++ > > > > > gcc/config/i386/i386-expand.cc| 19 > > > > > gcc/config/i386/i386.md | 24 ++ > > > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 > > > > > +++ > > > > > 5 files changed, 104 insertions(+), 2 deletions(-) > > > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > > > > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > > > > b/gcc/config/i386/amxtileintrin.h > > > > > index d1a26e0fea5..5081b326498 100644 > > > > > --- a/gcc/config/i386/amxtileintrin.h > > > > > +++ b/gcc/config/i386/amxtileintrin.h > > > > > @@ -39,14 +39,14 @@ extern __inline void > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > _tile_loadconfig (const void *__config) > > > > > { > > > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > > **)__config))); > > > > > + __builtin_ia32_ldtilecfg (__config); > > > > > } > > > > > > > > > > extern __inline void > > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > > _tile_storeconfig (void *__config) > > > > > { > > > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > > > > > + __builtin_ia32_sttilecfg (__config); > > > > > } > > > > > > > > > > extern __inline void > > > > > diff --git a/gcc/config/i386/i386-builtin.def > > > > > b/gcc/config/i386/i386-builtin.def > > > > > index 729355230b8..88dd7f8857f 100644 > > > > > --- a/gcc/config/i386/i386-builtin.def > > > > > +++ b/gcc/config/i386/i386-builtin.def > > > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | > > > > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b > > > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > > > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, > > > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > > > 
> CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > > > > > > +/* LDFILECFG and STFILECFG. */ > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", > > > > > IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID) > > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > > > CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", > > > > > IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID) > > > > CODE_FOR_sttilecfg. > > > > > > It is unused. I changed both to CODE_FOR_nothing. > > > > > > > > + > > > > > /* SSE */ > > > > > BDESC (OPTION_MASK_ISA_SSE, 0, COD
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Sun, Feb 25, 2024 at 7:03 PM Hongtao Liu wrote: > > On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu wrote: > > > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu wrote: > > > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > > > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > > > _tile_loadconfig implemented as > > > > > > > > extern __inline void > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > _tile_loadconfig (const void *__config) > > > > { > > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > **)__config))); > > > > } > > > > > > > > GCC sees: > > > > > > > > (parallel [ > > > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > > > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > > > (const_int -64 [0xffc0])) [1 > > > > MEM[(const void * *)&tile_data]+0 S8 A128])] > > > >[(asm_input:DI ("m"))] > > > >(clobber (reg:CC 17 flags))]) > > > > > > > > and the memory operand size is 1 byte. As the result, the rest of 511 > > > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > > > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > > > > > gcc/ChangeLog: > > > > > > > > PR target/114098 > > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > > > __builtin_ia32_ldtilecfg. > > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > > > * config/i386/i386-builtin.def (BDESC): Add > > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > > > * config/i386/i386.md (ldtilecfg): New pattern. > > > > (sttilecfg): Likewise. > > > > > > > > gcc/testsuite/ChangeLog: > > > > > > > > PR target/114098 > > > > * gcc.target/i386/amxtile-4.c: New test. 
> > > > --- > > > > gcc/config/i386/amxtileintrin.h | 4 +- > > > > gcc/config/i386/i386-builtin.def | 4 ++ > > > > gcc/config/i386/i386-expand.cc| 19 > > > > gcc/config/i386/i386.md | 24 ++ > > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++ > > > > 5 files changed, 104 insertions(+), 2 deletions(-) > > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > > > b/gcc/config/i386/amxtileintrin.h > > > > index d1a26e0fea5..5081b326498 100644 > > > > --- a/gcc/config/i386/amxtileintrin.h > > > > +++ b/gcc/config/i386/amxtileintrin.h > > > > @@ -39,14 +39,14 @@ extern __inline void > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > _tile_loadconfig (const void *__config) > > > > { > > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > > **)__config))); > > > > + __builtin_ia32_ldtilecfg (__config); > > > > } > > > > > > > > extern __inline void > > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > > _tile_storeconfig (void *__config) > > > > { > > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > > > > + __builtin_ia32_sttilecfg (__config); > > > > } > > > > > > > > extern __inline void > > > > diff --git a/gcc/config/i386/i386-builtin.def > > > > b/gcc/config/i386/i386-builtin.def > > > > index 729355230b8..88dd7f8857f 100644 > > > > --- a/gcc/config/i386/i386-builtin.def > > > > +++ b/gcc/config/i386/i386-builtin.def > > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | > > > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b > > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, > > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > > > UNKNOWN, (int) 
VOID_FTYPE_PVOID_INT64) > > > > > > > > +/* LDFILECFG and STFILECFG. */ > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, > > > > UNKNOWN, (int) VOID_FTYPE_PCVOID) > > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > > CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, > > > > UNKNOWN, (int) VOID_FTYPE_PVOID) > > > CODE_FOR_sttilecfg. > > > > It is unused. I changed both to CODE_FOR_nothing. > > > > > > + > > > > /* SSE */ > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, > > > > "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) > > > > VOID_FTYPE_PFLOAT_V4SF) > > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, > > > > "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) > > > > VOID_
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Mon, Feb 26, 2024 at 10:37 AM H.J. Lu wrote: > > On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu wrote: > > > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > > _tile_loadconfig implemented as > > > > > > extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_loadconfig (const void *__config) > > > { > > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > > > } > > > > > > GCC sees: > > > > > > (parallel [ > > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > > (const_int -64 [0xffc0])) [1 > > > MEM[(const void * *)&tile_data]+0 S8 A128])] > > >[(asm_input:DI ("m"))] > > >(clobber (reg:CC 17 flags))]) > > > > > > and the memory operand size is 1 byte. As the result, the rest of 511 > > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > > > gcc/ChangeLog: > > > > > > PR target/114098 > > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > > __builtin_ia32_ldtilecfg. > > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > > * config/i386/i386-builtin.def (BDESC): Add > > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > > * config/i386/i386.md (ldtilecfg): New pattern. > > > (sttilecfg): Likewise. > > > > > > gcc/testsuite/ChangeLog: > > > > > > PR target/114098 > > > * gcc.target/i386/amxtile-4.c: New test. 
> > > --- > > > gcc/config/i386/amxtileintrin.h | 4 +- > > > gcc/config/i386/i386-builtin.def | 4 ++ > > > gcc/config/i386/i386-expand.cc| 19 > > > gcc/config/i386/i386.md | 24 ++ > > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++ > > > 5 files changed, 104 insertions(+), 2 deletions(-) > > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > > b/gcc/config/i386/amxtileintrin.h > > > index d1a26e0fea5..5081b326498 100644 > > > --- a/gcc/config/i386/amxtileintrin.h > > > +++ b/gcc/config/i386/amxtileintrin.h > > > @@ -39,14 +39,14 @@ extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_loadconfig (const void *__config) > > > { > > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void > > > **)__config))); > > > + __builtin_ia32_ldtilecfg (__config); > > > } > > > > > > extern __inline void > > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > > _tile_storeconfig (void *__config) > > > { > > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > > > + __builtin_ia32_sttilecfg (__config); > > > } > > > > > > extern __inline void > > > diff --git a/gcc/config/i386/i386-builtin.def > > > b/gcc/config/i386/i386-builtin.def > > > index 729355230b8..88dd7f8857f 100644 > > > --- a/gcc/config/i386/i386-builtin.def > > > +++ b/gcc/config/i386/i386-builtin.def > > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | > > > OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b > > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > > > +/* LDFILECFG and STFILECFG. 
*/ > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, > > > UNKNOWN, (int) VOID_FTYPE_PCVOID) > > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > > CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, > > > UNKNOWN, (int) VOID_FTYPE_PVOID) > > CODE_FOR_sttilecfg. > > It is unused. I changed both to CODE_FOR_nothing. > > > > + > > > /* SSE */ > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, > > > "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) > > > VOID_FTYPE_PFLOAT_V4SF) > > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, > > > "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) > > > VOID_FTYPE_PFLOAT_V4SF) > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc > > > index a4d3369f01b..17993eb837f 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -14152,6 +14152,25 @@ ix86_expand_built
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Sun, Feb 25, 2024 at 6:03 PM Hongtao Liu wrote: > > On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > > > ldtilecfg and sttilecfg take a 512-byte memory block. With > > _tile_loadconfig implemented as > > > > extern __inline void > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > _tile_loadconfig (const void *__config) > > { > > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > > } > > > > GCC sees: > > > > (parallel [ > > (asm_operands/v ("ldtilecfg %X0") ("") 0 > >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > > (const_int -64 [0xffc0])) [1 > > MEM[(const void * *)&tile_data]+0 S8 A128])] > >[(asm_input:DI ("m"))] > >(clobber (reg:CC 17 flags))]) > > > > and the memory operand size is 1 byte. As the result, the rest of 511 > > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > > with a pointer to BLKmode to honor the 512-byte memory block. > > > > gcc/ChangeLog: > > > > PR target/114098 > > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > > __builtin_ia32_ldtilecfg. > > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > > * config/i386/i386-builtin.def (BDESC): Add > > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > > * config/i386/i386.md (ldtilecfg): New pattern. > > (sttilecfg): Likewise. > > > > gcc/testsuite/ChangeLog: > > > > PR target/114098 > > * gcc.target/i386/amxtile-4.c: New test. 
> > --- > > gcc/config/i386/amxtileintrin.h | 4 +- > > gcc/config/i386/i386-builtin.def | 4 ++ > > gcc/config/i386/i386-expand.cc| 19 > > gcc/config/i386/i386.md | 24 ++ > > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++ > > 5 files changed, 104 insertions(+), 2 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > > > diff --git a/gcc/config/i386/amxtileintrin.h > > b/gcc/config/i386/amxtileintrin.h > > index d1a26e0fea5..5081b326498 100644 > > --- a/gcc/config/i386/amxtileintrin.h > > +++ b/gcc/config/i386/amxtileintrin.h > > @@ -39,14 +39,14 @@ extern __inline void > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > _tile_loadconfig (const void *__config) > > { > > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > > + __builtin_ia32_ldtilecfg (__config); > > } > > > > extern __inline void > > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > > _tile_storeconfig (void *__config) > > { > > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > > + __builtin_ia32_sttilecfg (__config); > > } > > > > extern __inline void > > diff --git a/gcc/config/i386/i386-builtin.def > > b/gcc/config/i386/i386-builtin.def > > index 729355230b8..88dd7f8857f 100644 > > --- a/gcc/config/i386/i386-builtin.def > > +++ b/gcc/config/i386/i386-builtin.def > > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, > > 0, CODE_FOR_nothing, "__b > > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, > > CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, > > CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, > > UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) > > > > +/* LDFILECFG and STFILECFG. 
*/ > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, > > UNKNOWN, (int) VOID_FTYPE_PCVOID) > > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, > > CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, > > UNKNOWN, (int) VOID_FTYPE_PVOID) > CODE_FOR_sttilecfg. It is unused. I changed both to CODE_FOR_nothing. > > + > > /* SSE */ > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, > > "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) > > VOID_FTYPE_PFLOAT_V4SF) > > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, > > "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) > > VOID_FTYPE_PFLOAT_V4SF) > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > > index a4d3369f01b..17993eb837f 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx > > subtarget, > > emit_insn (pat); > >return 0; > > > > +case IX86_BUILTIN_LDTILECFG: > > +case IX86_BUILTIN_STTILECFG: > > + arg0 = CALL_EXPR_ARG (exp, 0); > > + op0 = expand_normal (arg0); > > + > > + if (!address_oper
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
On Mon, Feb 26, 2024 at 5:11 AM H.J. Lu wrote: > > ldtilecfg and sttilecfg take a 512-byte memory block. With > _tile_loadconfig implemented as > > extern __inline void > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _tile_loadconfig (const void *__config) > { > __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > } > > GCC sees: > > (parallel [ > (asm_operands/v ("ldtilecfg %X0") ("") 0 >[(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) > (const_int -64 [0xffc0])) [1 MEM[(const > void * *)&tile_data]+0 S8 A128])] >[(asm_input:DI ("m"))] >(clobber (reg:CC 17 flags))]) > > and the memory operand size is 1 byte. As the result, the rest of 511 > bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics > with a pointer to BLKmode to honor the 512-byte memory block. > > gcc/ChangeLog: > > PR target/114098 > * config/i386/amxtileintrin.h (_tile_loadconfig): Use > __builtin_ia32_ldtilecfg. > (_tile_storeconfig): Use __builtin_ia32_sttilecfg. > * config/i386/i386-builtin.def (BDESC): Add > __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. > * config/i386/i386-expand.cc (ix86_expand_builtin): Handle > IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. > * config/i386/i386.md (ldtilecfg): New pattern. > (sttilecfg): Likewise. > > gcc/testsuite/ChangeLog: > > PR target/114098 > * gcc.target/i386/amxtile-4.c: New test. 
> --- > gcc/config/i386/amxtileintrin.h | 4 +- > gcc/config/i386/i386-builtin.def | 4 ++ > gcc/config/i386/i386-expand.cc| 19 > gcc/config/i386/i386.md | 24 ++ > gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++ > 5 files changed, 104 insertions(+), 2 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c > > diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h > index d1a26e0fea5..5081b326498 100644 > --- a/gcc/config/i386/amxtileintrin.h > +++ b/gcc/config/i386/amxtileintrin.h > @@ -39,14 +39,14 @@ extern __inline void > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _tile_loadconfig (const void *__config) > { > - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); > + __builtin_ia32_ldtilecfg (__config); > } > > extern __inline void > __attribute__((__gnu_inline__, __always_inline__, __artificial__)) > _tile_storeconfig (void *__config) > { > - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); > + __builtin_ia32_sttilecfg (__config); > } > > extern __inline void > diff --git a/gcc/config/i386/i386-builtin.def > b/gcc/config/i386/i386-builtin.def > index 729355230b8..88dd7f8857f 100644 > --- a/gcc/config/i386/i386-builtin.def > +++ b/gcc/config/i386/i386-builtin.def > @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, > 0, CODE_FOR_nothing, "__b > BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, > "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) > VOID_FTYPE_PVOID_INT64) > BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, > "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) > VOID_FTYPE_PVOID_INT64) > > +/* LDFILECFG and STFILECFG. 
*/ > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, > "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) > VOID_FTYPE_PCVOID) > +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, > "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) > VOID_FTYPE_PVOID) CODE_FOR_sttilecfg. > + > /* SSE */ > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, > "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) > VOID_FTYPE_PFLOAT_V4SF) > BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, > "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) > VOID_FTYPE_PFLOAT_V4SF) > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index a4d3369f01b..17993eb837f 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx > subtarget, > emit_insn (pat); >return 0; > > +case IX86_BUILTIN_LDTILECFG: > +case IX86_BUILTIN_STTILECFG: > + arg0 = CALL_EXPR_ARG (exp, 0); > + op0 = expand_normal (arg0); > + > + if (!address_operand (op0, VOIDmode)) > + { > + op0 = convert_memory_address (Pmode, op0); > + op0 = copy_addr_to_reg (op0); > + } > + op0 = gen_rtx_MEM (BLKmode, op0); maybe we can just use XImode, and adjust the patterns with XI. > + if (fcode == IX86_BUILTIN_LDTILECFG) > + icode = CODE_FOR_ldtilecfg; > + else >
Re: [PATCH] x86: Properly implement AMX-TILE load/store intrinsics
Thanks for fixing this! I didn't notice that the pointer conversion could cause this issue... Would it be possible to use a local array instead, like char a[64] = (char *)p __asm__ volatile ("ldtilecfg\t%X0" :: "m" (a)); If not, for the two patterns we can use "m" instead of "jm", as APX supports the EGPR extension for AMX.
[PATCH] x86: Properly implement AMX-TILE load/store intrinsics
ldtilecfg and sttilecfg take a 512-byte memory block. With _tile_loadconfig implemented as extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _tile_loadconfig (const void *__config) { __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); } GCC sees: (parallel [ (asm_operands/v ("ldtilecfg %X0") ("") 0 [(mem/f/c:DI (plus:DI (reg/f:DI 77 virtual-stack-vars) (const_int -64 [0xffc0])) [1 MEM[(const void * *)&tile_data]+0 S8 A128])] [(asm_input:DI ("m"))] (clobber (reg:CC 17 flags))]) and the memory operand size is 1 byte. As the result, the rest of 511 bytes is ignored by GCC. Implement ldtilecfg and sttilecfg intrinsics with a pointer to BLKmode to honor the 512-byte memory block. gcc/ChangeLog: PR target/114098 * config/i386/amxtileintrin.h (_tile_loadconfig): Use __builtin_ia32_ldtilecfg. (_tile_storeconfig): Use __builtin_ia32_sttilecfg. * config/i386/i386-builtin.def (BDESC): Add __builtin_ia32_ldtilecfg and __builtin_ia32_sttilecfg. * config/i386/i386-expand.cc (ix86_expand_builtin): Handle IX86_BUILTIN_LDTILECFG and IX86_BUILTIN_STTILECFG. * config/i386/i386.md (ldtilecfg): New pattern. (sttilecfg): Likewise. gcc/testsuite/ChangeLog: PR target/114098 * gcc.target/i386/amxtile-4.c: New test. 
--- gcc/config/i386/amxtileintrin.h | 4 +- gcc/config/i386/i386-builtin.def | 4 ++ gcc/config/i386/i386-expand.cc| 19 gcc/config/i386/i386.md | 24 ++ gcc/testsuite/gcc.target/i386/amxtile-4.c | 55 +++ 5 files changed, 104 insertions(+), 2 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/amxtile-4.c diff --git a/gcc/config/i386/amxtileintrin.h b/gcc/config/i386/amxtileintrin.h index d1a26e0fea5..5081b326498 100644 --- a/gcc/config/i386/amxtileintrin.h +++ b/gcc/config/i386/amxtileintrin.h @@ -39,14 +39,14 @@ extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _tile_loadconfig (const void *__config) { - __asm__ volatile ("ldtilecfg\t%X0" :: "m" (*((const void **)__config))); + __builtin_ia32_ldtilecfg (__config); } extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _tile_storeconfig (void *__config) { - __asm__ volatile ("sttilecfg\t%X0" : "=m" (*((void **)__config))); + __builtin_ia32_sttilecfg (__config); } extern __inline void diff --git a/gcc/config/i386/i386-builtin.def b/gcc/config/i386/i386-builtin.def index 729355230b8..88dd7f8857f 100644 --- a/gcc/config/i386/i386-builtin.def +++ b/gcc/config/i386/i386-builtin.def @@ -126,6 +126,10 @@ BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__b BDESC (OPTION_MASK_ISA_XSAVES | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xrstors64", IX86_BUILTIN_XRSTORS64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) BDESC (OPTION_MASK_ISA_XSAVEC | OPTION_MASK_ISA_64BIT, 0, CODE_FOR_nothing, "__builtin_ia32_xsavec64", IX86_BUILTIN_XSAVEC64, UNKNOWN, (int) VOID_FTYPE_PVOID_INT64) +/* LDFILECFG and STFILECFG. 
*/ +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_ldtilecfg", IX86_BUILTIN_LDTILECFG, UNKNOWN, (int) VOID_FTYPE_PCVOID) +BDESC (OPTION_MASK_ISA_64BIT, OPTION_MASK_ISA2_AMX_TILE, CODE_FOR_ldtilecfg, "__builtin_ia32_sttilecfg", IX86_BUILTIN_STTILECFG, UNKNOWN, (int) VOID_FTYPE_PVOID) + /* SSE */ BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_movv4sf_internal, "__builtin_ia32_storeups", IX86_BUILTIN_STOREUPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF) BDESC (OPTION_MASK_ISA_SSE, 0, CODE_FOR_sse_movntv4sf, "__builtin_ia32_movntps", IX86_BUILTIN_MOVNTPS, UNKNOWN, (int) VOID_FTYPE_PFLOAT_V4SF) diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index a4d3369f01b..17993eb837f 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -14152,6 +14152,25 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, emit_insn (pat); return 0; +case IX86_BUILTIN_LDTILECFG: +case IX86_BUILTIN_STTILECFG: + arg0 = CALL_EXPR_ARG (exp, 0); + op0 = expand_normal (arg0); + + if (!address_operand (op0, VOIDmode)) + { + op0 = convert_memory_address (Pmode, op0); + op0 = copy_addr_to_reg (op0); + } + op0 = gen_rtx_MEM (BLKmode, op0); + if (fcode == IX86_BUILTIN_LDTILECFG) + icode = CODE_FOR_ldtilecfg; + else + icode = CODE_FOR_sttilecfg; + pat = GEN_FCN (icode) (op0); + emit_insn (pat); + return 0; + case IX86_BUILTIN_LLWPCB: arg0 = CALL_EXPR_ARG (exp, 0); op0 = expand_normal (arg0); diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md index 6a26d966a0e..0ede6adac2f 100644 --- a/gcc/config/i386/i386.md +++ b/gcc/confi