================ @@ -150,13 +150,14 @@ define void @fcvtzu_v16f16_v16i32(ptr %a, ptr %b) #0 { ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8 ; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] ; VBITS_GE_256-NEXT: ptrue p0.s, vl8 -; VBITS_GE_256-NEXT: uunpklo z1.s, z0.h -; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16 +; VBITS_GE_256-NEXT: movprfx z1, z0 +; VBITS_GE_256-NEXT: ext z1.b, z1.b, z0.b, #16 ; VBITS_GE_256-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: uunpklo z1.s, z1.h ; VBITS_GE_256-NEXT: fcvtzu z0.s, p0/m, z0.h -; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1] -; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1, x8, lsl #2] +; VBITS_GE_256-NEXT: fcvtzu z1.s, p0/m, z1.h +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x1] +; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x1, x8, lsl #2] ---------------- gbossu wrote:
In that example, we do get one more instruction now (the `movprfx`), but I think the schedule is actually better because we eliminate one dependency between `ext` and the second `uunpklo`. Now the two `uunpklo` instructions can execute in parallel. This is the theme of the test updates in general: sometimes more instructions, but more freedom for the `MachineScheduler`. https://github.com/llvm/llvm-project/pull/152554 _______________________________________________ llvm-branch-commits mailing list llvm-branch-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits