https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95796
Bug ID: 95796
Summary: Inlining works between functions with the same target
attribute but not target_clones
Product: gcc
Version: 10.1.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: ipa
Assignee: unassigned at gcc dot gnu.org
Reporter: yyc1992 at gmail dot com
CC: marxin at gcc dot gnu.org
Target Milestone: ---
If two functions with the same target attribute call each other, GCC can
inline one into the other (although sometimes incorrectly; see PR95790). This
can be shown with the following code (all compilations use `g++ -O2 -S
-fno-exceptions -fno-asynchronous-unwind-tables`).
```
__attribute__ ((target ("default")))
static unsigned foo()
{
return 1;
}
__attribute__ ((target ("avx")))
static unsigned foo() {
return 1;
}
__attribute__ ((target ("default")))
unsigned bar()
{
return foo();
}
__attribute__ ((target ("avx")))
unsigned bar()
{
return foo();
}
```
which is compiled to
```
.text
.p2align 4
.globl _Z3barv
.type _Z3barv, @function
_Z3barv:
movl $1, %eax
ret
.size _Z3barv, .-_Z3barv
.p2align 4
.globl _Z3barv.avx
.type _Z3barv.avx, @function
_Z3barv.avx:
movl $1, %eax
ret
.size _Z3barv.avx, .-_Z3barv.avx
```
OTOH, the equivalent code using `target_clones`
```
__attribute__ ((target_clones ("default,avx")))
static unsigned foo()
{
return 1;
}
__attribute__ ((target_clones ("default,avx")))
unsigned bar()
{
return foo();
}
```
compiles to
```
.text
.p2align 4
.type _ZL3foov.default.1, @function
_ZL3foov.default.1:
movl $1, %eax
ret
.size _ZL3foov.default.1, .-_ZL3foov.default.1
.p2align 4
.type _Z3barv.default.1, @function
_Z3barv.default.1:
jmp _ZL3foov.default.1
.size _Z3barv.default.1, .-_Z3barv.default.1
.p2align 4
.type _ZL3foov.avx.0, @function
_ZL3foov.avx.0:
movl $1, %eax
ret
.size _ZL3foov.avx.0, .-_ZL3foov.avx.0
.p2align 4
.type _Z3barv.avx.0, @function
_Z3barv.avx.0:
jmp _ZL3foov.avx.0
.size _Z3barv.avx.0, .-_Z3barv.avx.0
.section .text._Z3barv.resolver,"axG",@progbits,_Z3barv.resolver,comdat
.p2align 4
.weak _Z3barv.resolver
.type _Z3barv.resolver, @function
_Z3barv.resolver:
subq $8, %rsp
call __cpu_indicator_init@PLT
movq __cpu_model@GOTPCREL(%rip), %rax
leaq _Z3barv.avx.0(%rip), %rdx
testb $2, 13(%rax)
leaq _Z3barv.default.1(%rip), %rax
cmovne %rdx, %rax
addq $8, %rsp
ret
.size _Z3barv.resolver, .-_Z3barv.resolver
.globl _Z3barv
.type _Z3barv, @gnu_indirect_function
.set _Z3barv,_Z3barv.resolver
.text
.p2align 4
.type _ZL3foov.resolver, @function
_ZL3foov.resolver:
subq $8, %rsp
call __cpu_indicator_init@PLT
movq __cpu_model@GOTPCREL(%rip), %rax
leaq _ZL3foov.avx.0(%rip), %rdx
testb $2, 13(%rax)
leaq _ZL3foov.default.1(%rip), %rax
cmovne %rdx, %rax
addq $8, %rsp
ret
.size _ZL3foov.resolver, .-_ZL3foov.resolver
```
instead, which only eliminates the indirect call but does not inline `foo` into
`bar`. (Note that the useless resolver for `foo` is PR95779.) I believe the two
versions should behave the same.
Ref PR95778 (PLT elimination)
Ref PR71990 (similar title but a different issue. That one is about inlining of
the dispatcher itself, IIUC, and is not about the case that can already be
statically dispatched. Unlike this one, it is also not specific to
target_clones.)