================
@@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 {
 ; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
 ; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0]
 ; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
-; VBITS_GE_256-NEXT:    uunpklo z1.s, z0.h
-; VBITS_GE_256-NEXT:    ext z0.b, z0.b, z0.b, #16
+; VBITS_GE_256-NEXT:    movprfx z1, z0
+; VBITS_GE_256-NEXT:    ext z1.b, z1.b, z0.b, #16
 ; VBITS_GE_256-NEXT:    uunpklo z0.s, z0.h
-; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    uunpklo z1.s, z1.h
 ; VBITS_GE_256-NEXT:    fcvtzu z0.s, p0/m, z0.h
-; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
-; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
+; VBITS_GE_256-NEXT:    fcvtzu z1.s, p0/m, z1.h
+; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
+; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1, x8, lsl #2]
----------------
gbossu wrote:

In that example, we do get one more instruction now (the `movprfx`), but I 
think the schedule is actually better because we eliminate one dependency 
between `ext` and the second `uunpklo`. Now the two `uunpklo` can execute in 
parallel.

This is is the theme of the test updates in general: Sometimes more 
instructions, but more freedom for the `MachineScheduler`

https://github.com/llvm/llvm-project/pull/152554
_______________________________________________
llvm-branch-commits mailing list
llvm-branch-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits

Reply via email to