https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125941
Bug ID: 125941
Summary: AArch64: Inefficient code generation for non-constant
svbool_t initializer
Product: gcc
Version: 17.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: Chris.Bazley at arm dot com
Target Milestone: ---
The store_constructor function handles variable-length vector types
inefficiently using a fallback path. One source of such types is ongoing work
to enable predicated SLP vectorisation of basic blocks; another is an existing
test.
The cause is twofold:
1. The number of subparts in a variable-length vector type is not divisible by
the number of subparts of the element type.
2. convert_optab_handler cannot retrieve an insn code for vec_init that
converts from E_QImode to E_VNx16BImode.
Addressing either of these causes alone is insufficient. If both can be fixed,
then store_constructor should assert that a suitable insn code is available for
any uses of vec_init.
Reproducer:
make check-gcc RUNTESTFLAGS="aarch64-sve-acle.exp=cops_bool.c"
Variable values:
mode = E_VNx16BImode
eltmode = E_QImode
icode = CODE_FOR_nothing
Backtrace:
The source code for this test is macro-heavy. It includes the following:
svbool_t __attribute__ ((noipa)) \
func_init4 () { \
svbool_t temp = VECT_CSTN; \
return temp; \
} \
and:
#define VECT_CSTN { -1, t (), 0, -1, 0, f (), 0, 0, 0, -1, 0, -1, 0, -1, 0, -1
} /* { dg-warning "overflow in conversion from" "" { target c } } */
/* { dg-warning "narrowing conversion of" "" { target c++ } .-1 } */
which compiles to this GIMPLE:
__attribute__((noipa, noinline, noclone, no_icf))
svbool_t func_init4 ()
{
svbool_t temp;
int _1;
<signed-boolean:1> _2;
int _3;
<signed-boolean:1> _4;
<bb 2> [local count: 1073741824]:
_1 = t ();
_2 = _1 != 0;
_3 = f ();
_4 = _3 != 0;
temp_8 = {-1, _2, 0, -1, 0, _4, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1};
return temp_8;
}
which is lowered to very inefficient-looking AArch64 code:
func_init4:
.LFB24:
.cfi_startproc
addvl sp, sp, #-1
.cfi_escape 0xf,0x8,0x8f,0,0x92,0x2e,0,0x38,0x1e,0x22
sub sp, sp, #32
.cfi_escape 0xf,0xa,0x8f,0,0x92,0x2e,0,0x38,0x1e,0x23,0x20,0x22
stp x29, x30, [sp]
.cfi_escape 0x10,0x1d,0x2,0x8f,0
.cfi_escape 0x10,0x1e,0x2,0x8f,0x8
mov x29, sp
str x19, [sp, 16]
addvl sp, sp, #-18
.cfi_escape 0xf,0xb,0x8f,0,0x92,0x2e,0,0x8,0x98,0x1e,0x23,0x20,0x22
.cfi_escape 0x10,0x13,0xb,0x8f,0,0x92,0x2e,0,0x8,0x90,0x1e,0x23,0x10,0x22
str z8, [sp, #2, mul vl]
str z9, [sp, #3, mul vl]
str z10, [sp, #4, mul vl]
str z11, [sp, #5, mul vl]
str p15, [sp, #11, mul vl]
str p5, [sp, #1, mul vl]
str p6, [sp, #2, mul vl]
str p7, [sp, #3, mul vl]
str p8, [sp, #4, mul vl]
str p9, [sp, #5, mul vl]
str p10, [sp, #6, mul vl]
str p11, [sp, #7, mul vl]
str p12, [sp, #8, mul vl]
str p13, [sp, #9, mul vl]
str p14, [sp, #10, mul vl]
str z12, [sp, #6, mul vl]
str z13, [sp, #7, mul vl]
str z14, [sp, #8, mul vl]
str z15, [sp, #9, mul vl]
str z16, [sp, #10, mul vl]
str z17, [sp, #11, mul vl]
str z18, [sp, #12, mul vl]
str z19, [sp, #13, mul vl]
str z20, [sp, #14, mul vl]
str z21, [sp, #15, mul vl]
str p4, [sp]
str z22, [sp, #16, mul vl]
str z23, [sp, #17, mul vl]
.cfi_escape 0x10,0x48,0x8,0x8f,0,0x92,0x2e,0,0x40,0x1e,0x22
.cfi_escape 0x10,0x49,0x8,0x8f,0,0x92,0x2e,0,0x48,0x1e,0x22
.cfi_escape 0x10,0x4a,0x9,0x8f,0,0x92,0x2e,0,0x8,0x20,0x1e,0x22
.cfi_escape 0x10,0x4b,0x9,0x8f,0,0x92,0x2e,0,0x8,0x28,0x1e,0x22
.cfi_escape 0x10,0x4c,0x9,0x8f,0,0x92,0x2e,0,0x8,0x30,0x1e,0x22
.cfi_escape 0x10,0x4d,0x9,0x8f,0,0x92,0x2e,0,0x8,0x38,0x1e,0x22
.cfi_escape 0x10,0x4e,0x9,0x8f,0,0x92,0x2e,0,0x8,0x40,0x1e,0x22
.cfi_escape 0x10,0x4f,0x9,0x8f,0,0x92,0x2e,0,0x8,0x48,0x1e,0x22
bl t
mov w19, w0
bl f
mov w2, 151
cntd x1
pfalse p15.b
mov w3, 151
cmp w19, 0
mul x1, x1, x2
cntd x2
mul x2, x2, x3
add x1, x1, 32
add x1, sp, x1
str p15, [x1]
add x2, sp, x2
mov w1, 1
strh w1, [x2, 32]
cntd x1
rdvl x2, #20
mul x1, x1, x3
rdvl x3, #20
add x1, x1, 32
add x1, sp, x1
ldr p15, [x1]
addpl x1, x2, #-10
rdvl x2, #20
add x1, x1, 32
add x1, sp, x1
str p15, [x1]
addpl x1, x2, #-10
csetm w2, ne
add x1, x1, 32
cmp w0, 0
ldrh w1, [sp, x1]
bfi w1, w2, 1, 1
addpl x2, x3, #-10
add x2, x2, 32
add x2, sp, x2
strh w1, [x2]
addpl x1, x3, #-10
add x1, x1, 32
mov w2, 149
add x1, sp, x1
ldr p15, [x1]
mov w3, 149
cntd x1
mul x1, x1, x2
add x1, x1, 32
add x1, sp, x1
str p15, [x1]
etc.