[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-23 Thread Simon Tatham via Phabricator via cfe-commits
This revision was automatically updated to reflect the committed changes.
Closed by commit rG4321c6af28e9: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq 
intrinsics. (authored by simon_tatham).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/include/clang/Sema/Sema.h
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
  clang/test/Sema/arm-mve-immediates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMInstrInfo.td
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll

Index: llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
===
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
@@ -0,0 +1,365 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh0(<8 x i16> %a) {
+; CHECK-LABEL: test_vbicq_n_u16_sh0:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i16 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <8 x i16> %a, 
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh8(<8 x i16> %a) {
+; CHECK-LABEL: test_vbicq_n_u16_sh8:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i16 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <8 x i16> %a, 
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh0(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh0:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i32 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh8(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh8:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i32 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh16(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh16:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i32 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh24(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_sh24:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vbic.i32 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+; The immediate in this case is legal for a VMVN but not for a VBIC,
+; so in this case we expect to see the constant being prepared in
+; another register.
+define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_illegal(<4 x i32> %a) {
+; CHECK-LABEL: test_vbicq_n_u32_illegal:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmvn.i32 q1, #0x54ff
+; CHECK-NEXT:vand q0, q0, q1
+; CHECK-NEXT:bx lr
+entry:
+  %0 = and <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_n_u16_sh0(<8 x i16> %a) {
+; CHECK-LABEL: test_vorrq_n_u16_sh0:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i16 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <8 x i16> %a, 
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vorrq_n_u16_sh8(<8 x i16> %a) {
+; CHECK-LABEL: test_vorrq_n_u16_sh8:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i16 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <8 x i16> %a, 
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh0(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh0:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i32 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh8(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh8:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i32 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh16(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh16:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i32 q0, #0x64
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh24(<4 x i32> %a) {
+; CHECK-LABEL: test_vorrq_n_u32_sh24:
+; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vorr.i32 q0, #0x6400
+; CHECK-NEXT:bx lr
+entry:
+  %0 = or <4 x i32> %a, 
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_n_u16_sh0(<8 x i16> %a, i16 zeroext %p) {

[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-23 Thread Dave Green via Phabricator via cfe-commits
dmgreen accepted this revision.
dmgreen added a comment.
This revision is now accepted and ready to land.

Looks good, from what I can tell.

I especially like the selects. We know that we have to do more work there, but 
adding this for more instructions would go a long way towards creating more 
predicated instructions (before the ability to do this in IR comes along).


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934





[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-22 Thread Simon Tatham via Phabricator via cfe-commits
simon_tatham updated this revision to Diff 239609.
simon_tatham marked an inline comment as done.
simon_tatham edited the summary of this revision.
simon_tatham added a comment.

I've revised the MC representations of VBIC and VORR as suggested, but that was
a big enough patch that I've done it separately as D73205. This patch now sits
on top of that one.

Changing VBIC and VORR meant I could do the isel for the unpredicated forms in
pure Tablegen. But the predicated ones would still have needed C++: the IR
intrinsics would have wanted the immediate in its natural form, whereas by the
time you generate an instruction it has to be re-encoded in the NEON format.
The simplest way was to stop adding new IR intrinsics, and instead encode the
predicated operations as a select. Then I still get to use isel lowering's
conversion into VBICIMM/VORRIMM, which does the immediate translation for me.
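
As a rough per-lane model of that select encoding (a sketch, not code from the
patch; the lane width, names and types below are purely illustrative):

```cpp
#include <cstdint>

// One 16-bit lane of a predicated immediate VBIC, modelled as a select:
// active lanes get "a & ~imm", inactive lanes keep their previous value.
// The patch expresses this shape in IR (an 'and' by the inverted splat
// constant, wrapped in a 'select' on the predicate) and lets the existing
// VBICIMM lowering fold it into a predicated VBIC.
static uint16_t vbicq_m_n_lane(uint16_t a, uint16_t imm, bool predicate) {
  uint16_t cleared = a & static_cast<uint16_t>(~imm); // unpredicated result
  return predicate ? cleared : a;                     // select on the predicate
}
```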

Adjusting the VMOVL pattern to expect the result of my modified lowering has 
made all those unrelated MVE codegen tests go back to the way they were before, 
so the new version of this patch doesn't have to change anything there.

Also added a negative llc test with an immediate that doesn't fit into VBICIMM, 
to prove that it gets sensibly selected as a different instruction sequence and 
nothing crashes.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/include/clang/Sema/Sema.h
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
  clang/test/Sema/arm-mve-immediates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMInstrInfo.td
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll


[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-20 Thread Dave Green via Phabricator via cfe-commits
dmgreen added a comment.

In D72934#1829387, @simon_tatham wrote:

> In D72934#1829331, @dmgreen wrote:
>
> > What is the reason that this can't be lowered in tablegen, in the same way 
> > as the VMOVimm's are?
>
>
> In NEON, immediate VBIC is represented as a single MC instruction, which 
> takes its immediate operand already encoded into the NEON format (8 data 
> bits, op and cmode). That's the same format that `ARMISD::VBICIMM` has 
> encoded the operand in after lowering. So you only need one tablegen pattern, 
> which passes the immediate through unchanged between the input and output 
> SDNode types.
>
> In MVE, immediate VBIC is represented as four separate MC instructions, for 
> an 8-bit immediate shifted left by 0, 8, 16 or 24 bits. Each one takes the 
> immediate operand in the 'natural' form, i.e. the numerical value that would 
> be combined into the vector lane and shown in assembly. For example, 
> `MVE_VBICIZ16v4i32` takes an operand such as `0xab` which NEON VBIC would 
> represent as `0xab | (control bits << 8)`. So the C++ isel code I've written 
> has to undo the NEON encoding and turn it back into the 'natural' immediate 
> value plus a choice of which MVE opcode to use.
>
> I suppose an alternative would be to rework the MC representation of MVE 
> VBIC/VORR so that they look more like the NEON versions. I don't exactly know 
> why MVE was done differently in the first place (the commit here has my name 
> on it, but it was a team effort). One possibility is that the 
> pseudo-instruction reversed forms `vand` and `vorn` might be hard to 
> represent that way, but I don't know.


I believe that the downstream VMOVimm's were rewritten like this when the other
BUILDVECTOR handling was added by DavidS. If it is possible to structure the
BICs this way too, it sounds like it might be a little cleaner.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934





[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-20 Thread Simon Tatham via Phabricator via cfe-commits
simon_tatham marked 2 inline comments as done.
simon_tatham added a comment.

In D72934#1829331, @dmgreen wrote:

> What is the reason that this can't be lowered in tablegen, in the same way as 
> the VMOVimm's are?


In NEON, immediate VBIC is represented as a single MC instruction, which takes 
its immediate operand already encoded into the NEON format (8 data bits, op and 
cmode). That's the same format that `ARMISD::VBICIMM` has encoded the operand 
in after lowering. So you only need one tablegen pattern, which passes the 
immediate through unchanged between the input and output SDNode types.

In MVE, immediate VBIC is represented as four separate MC instructions, for an 
8-bit immediate shifted left by 0, 8, 16 or 24 bits. Each one takes the 
immediate operand in the 'natural' form, i.e. the numerical value that would be 
combined into the vector lane and shown in assembly. For example, 
`MVE_VBICIZ16v4i32` takes an operand such as `0xab` which NEON VBIC would 
represent as `0xab | (control bits << 8)`. So the C++ isel code I've written 
has to undo the NEON encoding and turn it back into the 'natural' immediate 
value plus a choice of which MVE opcode to use.
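
A minimal sketch of that decoding step, assuming the layout described above
(data byte in the low 8 bits, cmode/op control bits above it); the
cmode-to-shift mapping follows the Arm modified-immediate table and is an
assumption, not code quoted from the patch:

```cpp
#include <cstdint>

// Recover the 'natural' immediate and its byte shift from a NEON-style
// modified-immediate encoding. The shift (0, 8, 16 or 24) is what chooses
// between the MVE opcodes for the four shift amounts (e.g. the
// MVE_VBICIZ16v4i32 mentioned above).
struct NaturalImm {
  unsigned Shift;  // 0, 8, 16 or 24
  uint32_t Value;  // e.g. data byte 0xab with shift 8 gives 0xab00
};

static NaturalImm decodeModImm(uint32_t Encoded) {
  uint32_t Data = Encoded & 0xff;            // 8 data bits
  uint32_t CMode = (Encoded >> 8) & 0xf;     // control bits
  unsigned Shift = ((CMode >> 1) & 0x3) * 8; // cmode<2:1> selects the byte lane
  return {Shift, Data << Shift};
}
```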

I suppose an alternative would be to rework the MC representation of MVE 
VBIC/VORR so that they look more like the NEON versions. I don't exactly know 
why MVE was done differently in the first place (the commit here has my name on 
it, but it was a team effort). One possibility is that the pseudo-instruction 
reversed forms `vand` and `vorn` might be hard to represent that way, but I 
don't know.

> Do you have any tests for what would be invalid bic values under MVE?

True, I suppose I could provide some immediates that are valid for other 
`VMOVModImmType`s, like `0xabff`, and make sure nothing goes wrong.




Comment at: llvm/lib/Target/ARM/ARMISelLowering.cpp:12181
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;

dmgreen wrote:
> This is OK because we are passing OtherModImm to isVMOVModifiedImm, and MVE 
> supports the same patterns as NEON?
Yes: `OtherModImm` only matches values of the form '8-bit number shifted left 
by a multiple of 8 bits', which is just what MVE VBIC and VORR take as well.
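
A standalone model of that immediate class (a sketch based on the description
above, not the actual OtherModImm / isVMOVModifiedImm implementation):

```cpp
#include <cstdint>

// True if Imm is an 8-bit value shifted left by a multiple of 8 within a lane
// of LaneBits bits, i.e. the shape that both OtherModImm and MVE's immediate
// VBIC/VORR accept according to the discussion above.
static bool isByteShiftedImm(uint32_t Imm, unsigned LaneBits) {
  for (unsigned Shift = 0; Shift < LaneBits; Shift += 8)
    if ((Imm & ~(0xffu << Shift)) == 0)
      return true;
  return false;
}

// For a 32-bit lane, 0x64, 0x6400, 0x640000 and 0x64000000 all qualify, while
// an immediate like 0xabff (two nonzero bytes) does not, so it cannot be
// selected as an immediate VBIC or VORR.
```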


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934





[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-20 Thread Dave Green via Phabricator via cfe-commits
dmgreen added a comment.

What is the reason that this can't be lowered in tablegen, in the same way as 
the VMOVimm's are?

For vbic vs vmovlb, the vmovlb does include a free register move, so it may
under some circumstances be slightly better. Like you say, it's mostly benign,
but it may be worth updating the MVE_VMOVL patterns.

Do you have any tests for what would be invalid bic values under MVE?




Comment at: llvm/lib/Target/ARM/ARMISelLowering.cpp:12181
      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
    if (SplatBitSize <= 64) {
      EVT VbicVT;

This is OK because we are passing OtherModImm to isVMOVModifiedImm, and MVE 
supports the same patterns as NEON?


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D72934/new/

https://reviews.llvm.org/D72934





[PATCH] D72934: [ARM,MVE] Support immediate vbicq,vorrq,vmvnq intrinsics.

2020-01-17 Thread Simon Tatham via Phabricator via cfe-commits
simon_tatham created this revision.
simon_tatham added reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard.
Herald added subscribers: llvm-commits, cfe-commits, hiraditya, kristof.beyls.
Herald added projects: clang, LLVM.

Immediate vmvnq is code-generated as a simple vector constant in IR,
and left to the backend to recognize that it can be created with an
MVE VMVN instruction. The predicated version is represented as a
select between the input and the same constant, and I've added a
Tablegen isel rule to turn that into a predicated VMVN. (That should
be better than the previous VMVN + VPSEL: it's the same number of
instructions but now it can fold into an adjacent VPT block.)

The unpredicated forms of VBIC and VORR are done by enabling the same
isel lowering as for NEON, recognizing appropriate immediates and
rewriting them as ARMISD::VBICIMM / ARMISD::VORRIMM SDNodes, which I
then instruction-select into the right MVE instructions (but in custom
C++, because of the awkward MC representation). In order to do that, I
had to promote the Tablegen SDNode instance `NEONvorrImm` to a general
`ARMvorrImm` available in MVE as well, and similarly for `NEONvbicImm`.

For the predicated forms of VBIC and VORR, I've just invented IR
intrinsics. I considered trying to match a call to the existing
predicated VBIC intrinsic where one argument was a vector splat, but
it looked like it would need a lot more code than the benefit justified.

This intrinsic family is the first to use the `imm_simd` system I put
into the MveEmitter tablegen backend. So, naturally, it showed up a
bug or two (emitting bogus range checks and the like). Fixed those,
and added a full set of tests for the permissible immediates in the
existing Sema test.

Finally, the new isel support for immediate VBIC has caused changes in
a few existing MVE codegen tests, because `vbic.i16 q0,#0xff00` has
now taken over from `vmovlb.u8 q0,q0` as LLVM's preferred way to clear
the top byte of each 16-bit lane. I think the changes are benign.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D72934

Files:
  clang/include/clang/Basic/arm_mve.td
  clang/include/clang/Basic/arm_mve_defs.td
  clang/include/clang/Sema/Sema.h
  clang/lib/Sema/SemaChecking.cpp
  clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c
  clang/test/Sema/arm-mve-immediates.c
  clang/utils/TableGen/MveEmitter.cpp
  llvm/include/llvm/IR/IntrinsicsARM.td
  llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
  llvm/lib/Target/ARM/ARMISelLowering.cpp
  llvm/lib/Target/ARM/ARMInstrInfo.td
  llvm/lib/Target/ARM/ARMInstrMVE.td
  llvm/lib/Target/ARM/ARMInstrNEON.td
  llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
  llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll
  llvm/test/CodeGen/Thumb2/mve-masked-load.ll
  llvm/test/CodeGen/Thumb2/mve-sext.ll
  llvm/test/CodeGen/Thumb2/mve-shuffleext.ll

Index: llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
===
--- llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
+++ llvm/test/CodeGen/Thumb2/mve-shuffleext.ll
@@ -73,7 +73,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src) {
 ; CHECK-LABEL: zext_02468101214:
 ; CHECK:   @ %bb.0: @ %entry
-; CHECK-NEXT:vmovlb.u8 q0, q0
+; CHECK-NEXT:vbic.i16 q0, #0xff00
 ; CHECK-NEXT:bx lr
 entry:
   %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> 
@@ -85,7 +85,7 @@
 ; CHECK-LABEL: zext_13579111315:
 ; CHECK:   @ %bb.0: @ %entry
 ; CHECK-NEXT:vrev16.8 q0, q0
-; CHECK-NEXT:vmovlb.u8 q0, q0
+; CHECK-NEXT:vbic.i16 q0, #0xff00
 ; CHECK-NEXT:bx lr
 entry:
   %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> 
Index: llvm/test/CodeGen/Thumb2/mve-sext.ll
===
--- llvm/test/CodeGen/Thumb2/mve-sext.ll
+++ llvm/test/CodeGen/Thumb2/mve-sext.ll
@@ -277,7 +277,7 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_v8i8_v8i16(<8 x i8> %src) {
 ; CHECK-LABEL: zext_v8i8_v8i16:
 ; CHECK:   @ %bb.0: @ %entry
-; CHECK-NEXT:vmovlb.u8 q0, q0
+; CHECK-NEXT:vbic.i16 q0, #0xff00
 ; CHECK-NEXT:bx lr
 entry:
   %0 = zext <8 x i8> %src to <8 x i16>
@@ -308,41 +308,41 @@
 define arm_aapcs_vfpcc <16 x i16> @zext_v16i8_v16i16(<16 x i8> %src) {
 ; CHECK-LABEL: zext_v16i8_v16i16:
 ; CHECK:   @ %bb.0: @ %entry
+; CHECK-NEXT:vmov q2, q0
 ; CHECK-NEXT:vmov.u8 r0, q0[0]
+; CHECK-NEXT:vmov.16 q0[0], r0
+; CHECK-NEXT:vmov.u8 r0, q2[1]
+; CHECK-NEXT:vmov.16 q0[1], r0
+; CHECK-NEXT:vmov.u8 r0, q2[2]
+; CHECK-NEXT:vmov.16 q0[2], r0
+; CHECK-NEXT:vmov.u8 r0, q2[3]
+; CHECK-NEXT:vmov.16 q0[3], r0
+; CHECK-NEXT:vmov.u8 r0, q2[4]
+; CHECK-NEXT:vmov.16 q0[4], r0
+; CHECK-NEXT:vmov.u8 r0, q2[5]
+; CHECK-NEXT:vmov.16 q0[5], r0
+; CHECK-NEXT:vmov.u8 r0, q2[6]
+; CHECK-NEXT:vmov.16 q0[6], r0
+; CHECK-NEXT:vmov.u8 r0, q2[7]
+; CHECK-NEXT:vmov.16 q0[7], r0
+; CHECK-NEXT:vmov.u8 r0,