| Issue |
76948
|
| Summary |
f16 rsqrt pattern fails to use v_rsq_f32 on targets without legal f16
|
| Labels |
backend:AMDGPU,
code-quality
|
| Assignees |
|
| Reporter |
arsenm
|
```
define hidden noundef half @test_rsqrt_f16(half noundef %x) local_unnamed_addr #0 {
entry:
%0 = tail call contract half @llvm.sqrt.f16(half %x)
%1 = fdiv contract half 0xH3C00, %0
ret half %1
}
```
On gfx6/gfx7, which do not have legal f16, this pattern fails to make use of v_rsq_f32. gfx8+ fuses it into a single v_rsq_f16. With -mcpu=tahiti, the output instead contains a full fdiv expansion:
```
test_rsqrt_f16: ; @test_rsqrt_f16
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
v_cvt_f16_f32_e32 v0, v0
s_mov_b32 s6, s33
s_mov_b32 s33, s32
s_mov_b32 s33, s6
v_cvt_f32_f16_e32 v0, v0
v_sqrt_f32_e32 v0, v0
v_cvt_f16_f32_e32 v0, v0
v_cvt_f32_f16_e32 v0, v0
v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
v_rcp_f32_e32 v2, v1
v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
v_fma_f32 v4, -v1, v2, 1.0
v_fma_f32 v2, v4, v2, v2
v_mul_f32_e32 v4, v3, v2
v_fma_f32 v5, -v1, v4, v3
v_fma_f32 v4, v5, v2, v4
v_fma_f32 v1, -v1, v4, v3
v_div_fmas_f32 v1, v1, v2, v4
v_div_fixup_f32 v0, v1, v0, 1.0
s_setpc_b64 s[30:31]
```
_______________________________________________
llvm-bugs mailing list
llvm-bugs@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-bugs