[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-08-03 Thread Thomas Lively via Phabricator via cfe-commits
This revision was landed with ongoing or failed builds.
This revision was automatically updated to reflect the committed changes.
Closed by commit rGcb327922101b: [WebAssembly] Implement prototype 
v128.load{32,64}_zero instructions (authored by tlively).

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

Files:
  clang/include/clang/Basic/BuiltinsWebAssembly.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-wasm.c
  llvm/include/llvm/IR/IntrinsicsWebAssembly.td
  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
  llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
  llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
  llvm/test/MC/WebAssembly/simd-encodings.s

Index: llvm/test/MC/WebAssembly/simd-encodings.s
===
--- llvm/test/MC/WebAssembly/simd-encodings.s
+++ llvm/test/MC/WebAssembly/simd-encodings.s
@@ -463,9 +463,6 @@
 # CHECK: i32x4.sub # encoding: [0xfd,0xb1,0x01]
 i32x4.sub
 
-# CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xb4,0x01]
-i32x4.dot_i16x8_s
-
 # CHECK: i32x4.mul # encoding: [0xfd,0xb5,0x01]
 i32x4.mul
 
@@ -481,6 +478,9 @@
 # CHECK: i32x4.max_u # encoding: [0xfd,0xb9,0x01]
 i32x4.max_u
 
+# CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xba,0x01]
+i32x4.dot_i16x8_s
+
 # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01]
 i64x2.neg
 
@@ -610,10 +610,16 @@
 # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
 f32x4.convert_i32x4_u
 
-# CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+# CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+v128.load32_zero 32
+
+# CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+v128.load64_zero 32
+
+# CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
 f32x4.qfma
 
-# CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+# CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
 f32x4.qfms
 
 # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]
Index: llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
===
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test SIMD v128.load{32,64}_zero instructions
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===
+; v128.load32_zero
+;===
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:i32.const -24
+; CHECK-NEXT:i32.add
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 -6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> 

[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-08-03 Thread Thomas Lively via Phabricator via cfe-commits
tlively updated this revision to Diff 282680.
tlively added a comment.

- Renumber i32x4.dot_i16x8_s to match V8 as well


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

Files:
  clang/include/clang/Basic/BuiltinsWebAssembly.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-wasm.c
  llvm/include/llvm/IR/IntrinsicsWebAssembly.td
  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
  llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
  llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
  llvm/test/MC/WebAssembly/simd-encodings.s

Index: llvm/test/MC/WebAssembly/simd-encodings.s
===
--- llvm/test/MC/WebAssembly/simd-encodings.s
+++ llvm/test/MC/WebAssembly/simd-encodings.s
@@ -463,9 +463,6 @@
 # CHECK: i32x4.sub # encoding: [0xfd,0xb1,0x01]
 i32x4.sub
 
-# CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xb4,0x01]
-i32x4.dot_i16x8_s
-
 # CHECK: i32x4.mul # encoding: [0xfd,0xb5,0x01]
 i32x4.mul
 
@@ -481,6 +478,9 @@
 # CHECK: i32x4.max_u # encoding: [0xfd,0xb9,0x01]
 i32x4.max_u
 
+# CHECK: i32x4.dot_i16x8_s # encoding: [0xfd,0xba,0x01]
+i32x4.dot_i16x8_s
+
 # CHECK: i64x2.neg # encoding: [0xfd,0xc1,0x01]
 i64x2.neg
 
@@ -610,10 +610,16 @@
 # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
 f32x4.convert_i32x4_u
 
-# CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+# CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+v128.load32_zero 32
+
+# CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+v128.load64_zero 32
+
+# CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
 f32x4.qfma
 
-# CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+# CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
 f32x4.qfms
 
 # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]
Index: llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
===
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test SIMD v128.load{32,64}_zero instructions
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===
+; v128.load32_zero
+;===
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:i32.const -24
+; CHECK-NEXT:i32.add
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 -6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_offset 

[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-07-30 Thread Thomas Lively via Phabricator via cfe-commits
tlively added inline comments.



Comment at: llvm/include/llvm/IR/IntrinsicsWebAssembly.td:198
+[LLVMPointerType],
+[IntrReadMem, IntrArgMemOnly, IntrSpeculatable],
+ "", [SDNPMemOperand]>;

aheejin wrote:
> Can memory accesses be speculatable? The below too
Hmm, maybe not, and it's definitely more conservative for them not to be.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-07-30 Thread Thomas Lively via Phabricator via cfe-commits
tlively updated this revision to Diff 282017.
tlively added a comment.

- Remove IntrSpeculatable


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

Files:
  clang/include/clang/Basic/BuiltinsWebAssembly.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-wasm.c
  llvm/include/llvm/IR/IntrinsicsWebAssembly.td
  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
  llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
  llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
  llvm/test/MC/WebAssembly/simd-encodings.s

Index: llvm/test/MC/WebAssembly/simd-encodings.s
===
--- llvm/test/MC/WebAssembly/simd-encodings.s
+++ llvm/test/MC/WebAssembly/simd-encodings.s
@@ -610,10 +610,16 @@
 # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
 f32x4.convert_i32x4_u
 
-# CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+# CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+v128.load32_zero 32
+
+# CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+v128.load64_zero 32
+
+# CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
 f32x4.qfma
 
-# CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+# CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
 f32x4.qfms
 
 # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]
Index: llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
===
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test SIMD v128.load{32,64}_zero instructions
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===
+; v128.load32_zero
+;===
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:i32.const -24
+; CHECK-NEXT:i32.add
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 -6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:i32.const 24
+; CHECK-NEXT:i32.add
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nsw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_offset:
+; CHECK: 

[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-07-30 Thread Heejin Ahn via Phabricator via cfe-commits
aheejin accepted this revision.
aheejin added inline comments.
This revision is now accepted and ready to land.



Comment at: llvm/include/llvm/IR/IntrinsicsWebAssembly.td:198
+[LLVMPointerType],
+[IntrReadMem, IntrArgMemOnly, IntrSpeculatable],
+ "", [SDNPMemOperand]>;

Can memory accesses be speculatable? The below too


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-07-29 Thread Thomas Lively via Phabricator via cfe-commits
tlively added a comment.

Since this changes opcodes, it needs to be landed in concert with the 
corresponding Binaryen change.


Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D84820/new/

https://reviews.llvm.org/D84820

___
cfe-commits mailing list
cfe-commits@lists.llvm.org
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits


[PATCH] D84820: [WebAssembly] Implement prototype v128.load{32,64}_zero instructions

2020-07-28 Thread Thomas Lively via Phabricator via cfe-commits
tlively created this revision.
tlively added a reviewer: aheejin.
Herald added subscribers: llvm-commits, cfe-commits, sunfish, hiraditya, 
jgravelle-google, sbc100, dschuff.
Herald added projects: clang, LLVM.
tlively requested review of this revision.

Specified in https://github.com/WebAssembly/simd/pull/237, these
instructions load the first vector lane from memory and zero the other
lanes. Since these instructions are not officially part of the SIMD
proposal, they are only available on an opt-in basis via LLVM
intrinsics and clang builtin functions. If these instructions are
merged to the proposal, this implementation will change so that the
instructions will be generated from normal IR. At that point the
intrinsics and builtin functions would be removed.

This PR also changes the opcodes for the experimental f32x4.qfm{a,s}
instructions because their opcodes conflicted with those of the
v128.load{32,64}_zero instructions. The new opcodes were chosen to
match those used in V8.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D84820

Files:
  clang/include/clang/Basic/BuiltinsWebAssembly.def
  clang/lib/CodeGen/CGBuiltin.cpp
  clang/test/CodeGen/builtins-wasm.c
  llvm/include/llvm/IR/IntrinsicsWebAssembly.td
  llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
  llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
  llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
  llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
  llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
  llvm/test/MC/WebAssembly/simd-encodings.s

Index: llvm/test/MC/WebAssembly/simd-encodings.s
===
--- llvm/test/MC/WebAssembly/simd-encodings.s
+++ llvm/test/MC/WebAssembly/simd-encodings.s
@@ -610,10 +610,16 @@
 # CHECK: f32x4.convert_i32x4_u # encoding: [0xfd,0xfb,0x01]
 f32x4.convert_i32x4_u
 
-# CHECK: f32x4.qfma # encoding: [0xfd,0xfc,0x01]
+# CHECK: v128.load32_zero 32 # encoding: [0xfd,0xfc,0x01,0x02,0x20]
+v128.load32_zero 32
+
+# CHECK: v128.load64_zero 32 # encoding: [0xfd,0xfd,0x01,0x03,0x20]
+v128.load64_zero 32
+
+# CHECK: f32x4.qfma # encoding: [0xfd,0xb4,0x01]
 f32x4.qfma
 
-# CHECK: f32x4.qfms # encoding: [0xfd,0xfd,0x01]
+# CHECK: f32x4.qfms # encoding: [0xfd,0xd4,0x01]
 f32x4.qfms
 
 # CHECK: f64x2.qfma # encoding: [0xfd,0xfe,0x01]
Index: llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
===
--- /dev/null
+++ llvm/test/CodeGen/WebAssembly/simd-load-zero-offset.ll
@@ -0,0 +1,228 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+; Test SIMD v128.load{32,64}_zero instructions
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown"
+
+declare <4 x i32> @llvm.wasm.load32.zero(i32*)
+declare <2 x i64> @llvm.wasm.load64.zero(i64*)
+
+;===
+; v128.load32_zero
+;===
+
+define <4 x i32> @load_zero_i32_no_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_no_offset:
+; CHECK: .functype load_zero_i32_no_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+  %v = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %p)
+  ret <4 x i32> %v
+}
+
+define <4 x i32> @load_zero_i32_with_folded_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_offset:
+; CHECK: .functype load_zero_i32_with_folded_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %q = ptrtoint i32* %p to i32
+  %r = add nuw i32 %q, 24
+  %s = inttoptr i32 %r to i32*
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_folded_gep_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_folded_gep_offset:
+; CHECK: .functype load_zero_i32_with_folded_gep_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:v128.load32_zero 24
+; CHECK-NEXT:# fallthrough-return
+  %s = getelementptr inbounds i32, i32* %p, i32 6
+  %t = tail call <4 x i32> @llvm.wasm.load32.zero(i32* %s)
+  ret <4 x i32> %t
+}
+
+define <4 x i32> @load_zero_i32_with_unfolded_gep_negative_offset(i32* %p) {
+; CHECK-LABEL: load_zero_i32_with_unfolded_gep_negative_offset:
+; CHECK: .functype load_zero_i32_with_unfolded_gep_negative_offset (i32) -> (v128)
+; CHECK-NEXT:  # %bb.0:
+; CHECK-NEXT:local.get 0
+; CHECK-NEXT:i32.const -24
+; CHECK-NEXT:i32.add
+; CHECK-NEXT:v128.load32_zero 0
+; CHECK-NEXT:# fallthrough-return
+