Author: Bradley Smith Date: 2021-09-09T09:05:58-07:00 New Revision: 0c2f8599a9cc42014b41aadf7c308b4889d2397f
URL: https://github.com/llvm/llvm-project/commit/0c2f8599a9cc42014b41aadf7c308b4889d2397f DIFF: https://github.com/llvm/llvm-project/commit/0c2f8599a9cc42014b41aadf7c308b4889d2397f.diff LOG: Workaround incorrect types when lowering fixed length gather/scatter When lowering a fixed length gather/scatter the index type is assumed to be the same as the memory type, this is incorrect in cases where the extension of the index has been folded into the addressing mode. For now add a temporary workaround to fix the codegen faults caused by this by preventing the removal of this extension. At a later date the lowering for SVE gather/scatters will be redesigned to improve the way addressing modes are handled. As a short term side effect of this change, the addressing modes generated for fixed length gather/scatters will not be optimal. Differential Revision: https://reviews.llvm.org/D109145 (cherry picked from commit 14e1a4a6eef2fb95ec852c9ddfc597f80bba3226) Added: Modified: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll Removed: ################################################################################ diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60c00f47859b0..494554ae7b331 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4161,7 +4161,8 @@ bool AArch64TargetLowering::shouldExtendGSIndex(EVT VT, EVT &EltTy) const { bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const { if (VT.getVectorElementType() == MVT::i32 && - VT.getVectorElementCount().getKnownMinValue() >= 4) + VT.getVectorElementCount().getKnownMinValue() >= 4 && + !VT.isFixedLengthVector()) return true; return false; diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll index ee64e6db65689..a52d3e91fac38 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -917,6 +917,7 @@ define void @masked_gather_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; The above tests test the types, the below tests check that the addressing ; modes still function +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f16: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 @@ -925,11 +926,15 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1] -; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h -; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #1] +; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s +; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h +; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -941,14 +946,21 @@ define void @masked_gather_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0 -; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #2] -; VBITS_GE_2048-NEXT: st1w { [[RES]].s }, [[PG]], [x0] +; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32 +; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG0]]/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #2] +; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s +; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -960,14 +972,16 @@ define void @masked_gather_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_sext_f64: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 +; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 -; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, sxtw #3] +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: ld1d { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d, lsl #3] ; VBITS_GE_2048-NEXT: st1d { [[RES]].d }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x double>, <32 x double>* %a @@ -980,6 +994,7 @@ define void @masked_gather_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { ; CHECK-LABEL: masked_gather_32b_scaled_zext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 @@ -988,11 +1003,15 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1] -; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h -; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[ZEXT]].d, lsl #1] +; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s +; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h +; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1004,6 +1023,7 @@ define void @masked_gather_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { ; CHECK-LABEL: masked_gather_32b_unscaled_sext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 @@ -1012,11 +1032,15 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw] -; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h -; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[SEXT]].d] +; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s +; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h +; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -1029,6 +1053,7 @@ define void @masked_gather_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8 ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { ; CHECK-LABEL: masked_gather_32b_unscaled_zext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 @@ -1037,11 +1062,15 @@ define void @masked_gather_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw] -; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h -; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[ZEXT]].d] +; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s +; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h +; VBITS_GE_2048-NEXT: st1h { [[UZP2]].h }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll index 2ce98f00687df..bfad86a2c0aa6 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -839,18 +839,24 @@ define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 { ; The above tests test the types, the below tests check that the addressing ; modes still function + +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f16: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -861,13 +867,20 @@ define void @masked_scatter_32b_scaled_sext_f16(<32 x half>* %a, <32 x i32>* %b, ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b, float* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f32: -; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl32 -; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG]]/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG]]/z, [[VALS]].s, #0.0 -; VBITS_GE_2048-NEXT: st1w { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #2] +; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32 +; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG0]]/z, [x1] +; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #2] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -878,14 +891,16 @@ define void @masked_scatter_32b_scaled_sext_f32(<32 x float>* %a, <32 x i32>* %b ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* %b, double* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_sext_f64: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32 -; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 +; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0.0 -; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, sxtw #3] +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], [x2, [[SEXT]].d, lsl #3] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x double>, <32 x double>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -896,18 +911,23 @@ define void @masked_scatter_32b_scaled_sext_f64(<32 x double>* %a, <32 x i32>* % ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_scaled_zext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[ZEXT]].d, lsl #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -918,18 +938,23 @@ define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, hal ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_unscaled_sext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: sunpklo [[SEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[SEXT]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -941,18 +966,23 @@ define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i ret void } +; NOTE: This produces an non-optimal addressing mode due to a temporary workaround define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 { ; CHECK-LABEL: masked_scatter_32b_unscaled_zext: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] +; VBITS_GE_2048-NEXT: ptrue [[PG2:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 ; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h -; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw] +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: uunpklo [[ZEXT:z[0-9]+]].d, [[PTRS]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG2]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [x2, [[ZEXT]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits