Re: [Mesa-dev] [PATCH] R600/SI: Split global vector loads with more than 4 elements

2014-02-11 Thread Tom Stellard
On Mon, Feb 10, 2014 at 02:35:03PM -0800, Matt Arsenault wrote:
 Why would you want to do this for the small types? You should be able to 
 load those in fewer loads and then promote them.


We only custom lower v*i32 loads, so this code won't be executed on
smaller types.

-Tom

 On 02/10/2014 01:32 PM, Tom Stellard wrote:
  From: Tom Stellard thomas.stell...@amd.com
 
  ---
lib/Target/R600/SIISelLowering.cpp |   8 +-
test/CodeGen/R600/load.ll  | 178 
  +++--
2 files changed, 98 insertions(+), 88 deletions(-)
 
  diff --git a/lib/Target/R600/SIISelLowering.cpp 
  b/lib/Target/R600/SIISelLowering.cpp
  index 9537405..eb08a13 100644
  --- a/lib/Target/R600/SIISelLowering.cpp
  +++ b/lib/Target/R600/SIISelLowering.cpp
  @@ -478,9 +478,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
  SelectionDAG &DAG) const {
  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
  case ISD::LOAD: {
LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
  -if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
  - Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
  -Op.getValueType().isVector()) {
  +if (Op.getValueType().isVector() &&
  +(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
  + Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
  + (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
  +  Op.getValueType().getVectorNumElements() > 4))) {
  SDValue MergedValues[2] = {
SplitVectorLoad(Op, DAG),
Load->getChain()
  diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
  index 0153524..1486c4d 100644
  --- a/test/CodeGen/R600/load.ll
  +++ b/test/CodeGen/R600/load.ll
  @@ -1,16 +1,15 @@
  -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
  --check-prefix=R600-CHECK %s
  -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck 
  --check-prefix=R600-CHECK %s
  -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
  --check-prefix=SI-CHECK  %s
  +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
  --check-prefix=R600-CHECK --check-prefix=FUNC %s
  +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck 
  --check-prefix=R600-CHECK --check-prefix=FUNC %s
  +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
  --check-prefix=SI-CHECK --check-prefix=FUNC %s


  ;======;
; GLOBAL ADDRESS SPACE

  ;======;

; Load an i8 value from the global address space.
  -; R600-CHECK-LABEL: @load_i8
  +; FUNC-LABEL: @load_i8
; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}

  -; SI-CHECK-LABEL: @load_i8
; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  %1 = load i8 addrspace(1)* %in
  @@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 
  addrspace(1)* %in) {
  ret void
}

  -; R600-CHECK-LABEL: @load_i8_sext
  +; FUNC-LABEL: @load_i8_sext
; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
; R600-CHECK: 24
; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
; R600-CHECK: 24
  -; SI-CHECK-LABEL: @load_i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
  @@ -35,10 +33,9 @@ entry:
  ret void
}

  -; R600-CHECK-LABEL: @load_v2i8
  +; FUNC-LABEL: @load_v2i8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
  -; SI-CHECK-LABEL: @load_v2i8
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> 
  addrspace(1)* %in) {
  @@ -49,7 +46,7 @@ entry:
  ret void
}

  -; R600-CHECK-LABEL: @load_v2i8_sext
  +; FUNC-LABEL: @load_v2i8_sext
; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], 
  [[DST_X]]
  @@ -60,7 +57,6 @@ entry:
; R600-CHECK-DAG: 24
; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
; R600-CHECK-DAG: 24
  -; SI-CHECK-LABEL: @load_v2i8_sext
; SI-CHECK: BUFFER_LOAD_SBYTE
; SI-CHECK: BUFFER_LOAD_SBYTE
define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> 
  addrspace(1)* %in) {
  @@ -71,12 +67,11 @@ entry:
  ret void
}

  -; R600-CHECK-LABEL: @load_v4i8
  +; FUNC-LABEL: @load_v4i8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
; R600-CHECK: VTX_READ_8
  -; SI-CHECK-LABEL: @load_v4i8
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
; SI-CHECK: BUFFER_LOAD_UBYTE
  @@ -89,7 +84,7 

[Mesa-dev] [PATCH] R600/SI: Split global vector loads with more than 4 elements

2014-02-10 Thread Tom Stellard
From: Tom Stellard thomas.stell...@amd.com

---
 lib/Target/R600/SIISelLowering.cpp |   8 +-
 test/CodeGen/R600/load.ll  | 178 +++--
 2 files changed, 98 insertions(+), 88 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 9537405..eb08a13 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -478,9 +478,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
   case ISD::LOAD: {
 LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
-Op.getValueType().isVector()) {
+if (Op.getValueType().isVector() &&
+(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+ (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+  Op.getValueType().getVectorNumElements() > 4))) {
   SDValue MergedValues[2] = {
 SplitVectorLoad(Op, DAG),
 Load->getChain()
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index 0153524..1486c4d 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,16 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
%s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
--check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK --check-prefix=FUNC %s
 
 
;======;
 ; GLOBAL ADDRESS SPACE
 
;======;
 
 ; Load an i8 value from the global address space.
-; R600-CHECK-LABEL: @load_i8
+; FUNC-LABEL: @load_i8
 ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
 
-; SI-CHECK-LABEL: @load_i8
 ; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
 define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8 addrspace(1)* %in
@@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 
addrspace(1)* %in) {
   ret void
 }
 
-; R600-CHECK-LABEL: @load_i8_sext
+; FUNC-LABEL: @load_i8_sext
 ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
 ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
 ; R600-CHECK: 24
 ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
 ; R600-CHECK: 24
-; SI-CHECK-LABEL: @load_i8_sext
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
@@ -35,10 +33,9 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @load_v2i8
+; FUNC-LABEL: @load_v2i8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v2i8
 ; SI-CHECK: BUFFER_LOAD_UBYTE
 ; SI-CHECK: BUFFER_LOAD_UBYTE
 define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* 
%in) {
@@ -49,7 +46,7 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @load_v2i8_sext
+; FUNC-LABEL: @load_v2i8_sext
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -60,7 +57,6 @@ entry:
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v2i8_sext
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> 
addrspace(1)* %in) {
@@ -71,12 +67,11 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @load_v4i8
+; FUNC-LABEL: @load_v4i8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
 ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v4i8
 ; SI-CHECK: BUFFER_LOAD_UBYTE
 ; SI-CHECK: BUFFER_LOAD_UBYTE
 ; SI-CHECK: BUFFER_LOAD_UBYTE
@@ -89,7 +84,7 @@ entry:
   ret void
 }
 
-; R600-CHECK-LABEL: @load_v4i8_sext
+; FUNC-LABEL: @load_v4i8_sext
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
 ; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -110,7 +105,6 @@ entry:
 ; R600-CHECK-DAG: 24
 ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_W_CHAN]]
 ; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v4i8_sext
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 ; SI-CHECK: BUFFER_LOAD_SBYTE
 ; SI-CHECK: BUFFER_LOAD_SBYTE
@@ -124,9 +118,8 @@ entry:
 }
 
 ; Load an i16 value from the global 

Re: [Mesa-dev] [PATCH] R600/SI: Split global vector loads with more than 4 elements

2014-02-10 Thread Matt Arsenault
Why would you want to do this for the small types? You should be able to 
load those in fewer loads and then promote them.


On 02/10/2014 01:32 PM, Tom Stellard wrote:

From: Tom Stellard thomas.stell...@amd.com

---
  lib/Target/R600/SIISelLowering.cpp |   8 +-
  test/CodeGen/R600/load.ll  | 178 +++--
  2 files changed, 98 insertions(+), 88 deletions(-)

diff --git a/lib/Target/R600/SIISelLowering.cpp 
b/lib/Target/R600/SIISelLowering.cpp
index 9537405..eb08a13 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -478,9 +478,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
-if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
- Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
-Op.getValueType().isVector()) {
+if (Op.getValueType().isVector() &&
+(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+ Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
+ (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+  Op.getValueType().getVectorNumElements() > 4))) {
SDValue MergedValues[2] = {
  SplitVectorLoad(Op, DAG),
  Load->getChain()
diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
index 0153524..1486c4d 100644
--- a/test/CodeGen/R600/load.ll
+++ b/test/CodeGen/R600/load.ll
@@ -1,16 +1,15 @@
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK %s
-; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
%s
-; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK  %s
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
--check-prefix=R600-CHECK --check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck --check-prefix=R600-CHECK 
--check-prefix=FUNC %s
+; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
--check-prefix=SI-CHECK --check-prefix=FUNC %s
  
  ;======;

  ; GLOBAL ADDRESS SPACE
  
;======;
  
  ; Load an i8 value from the global address space.

-; R600-CHECK-LABEL: @load_i8
+; FUNC-LABEL: @load_i8
  ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}
  
-; SI-CHECK-LABEL: @load_i8

  ; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
  define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
@@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 
addrspace(1)* %in) {
ret void
  }
  
-; R600-CHECK-LABEL: @load_i8_sext

+; FUNC-LABEL: @load_i8_sext
  ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
  ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
  ; R600-CHECK: 24
  ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
  ; R600-CHECK: 24
-; SI-CHECK-LABEL: @load_i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  entry:
@@ -35,10 +33,9 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v2i8

+; FUNC-LABEL: @load_v2i8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v2i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* 
%in) {
@@ -49,7 +46,7 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v2i8_sext

+; FUNC-LABEL: @load_v2i8_sext
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
  ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
@@ -60,7 +57,6 @@ entry:
  ; R600-CHECK-DAG: 24
  ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
  ; R600-CHECK-DAG: 24
-; SI-CHECK-LABEL: @load_v2i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_v2i8_sext(<2 x i32> addrspace(1)* %out, <2 x i8> 
addrspace(1)* %in) {
@@ -71,12 +67,11 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v4i8

+; FUNC-LABEL: @load_v4i8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
-; SI-CHECK-LABEL: @load_v4i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
@@ -89,7 +84,7 @@ entry:
ret void
  }
  
-; R600-CHECK-LABEL: @load_v4i8_sext

+; FUNC-LABEL: @load_v4i8_sext
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Z:T[0-9]\.[XYZW]]], [[DST_Z]]
@@ -110,7 +105,6 @@ entry:
  ; R600-CHECK-DAG: 24
  ; R600-CHECK-DAG: ASHR {{[* 

Re: [Mesa-dev] [PATCH] R600/SI: Split global vector loads with more than 4 elements

2014-02-10 Thread Aaron Watry
Hi Tom,

This definitely fixes some issues that I've been seeing with int8/16
vload8() and vload16() in CL.  vstore8/vstore16 are still broken, but
at least the loads are working now (I've only tested int, but I can
give a full test run if you want/need).

For reference, the tests that failed before were in:
piglit/test/cl/program/execute/vload-int.cl

Comment out the vload3 tests to get the rest to work. Previously int3,
int8, and int16 all failed; now only the int3 tests fail to build.

If you're curious, the vstore issues start like the following and can
be reproduced with the attached test case:
LLVM ERROR: Cannot select: 0x12fa1b0: v4i32 = extract_subvector
0x1c55240, 0x12f8ba0 [ORD=28] [ID=38]
  0x1c55240: v8i32 = BUILD_VECTOR 0x1c55140, 0x12f7a90, 0x12f7f90,
0x12f8490, 0x12f8aa0, 0x12f8fa0, 0x12f94a0, 0x12f9ab0 [ORD=24] [ID=35]

--Aaron

On Mon, Feb 10, 2014 at 3:32 PM, Tom Stellard t...@stellard.net wrote:
 From: Tom Stellard thomas.stell...@amd.com

 ---
  lib/Target/R600/SIISelLowering.cpp |   8 +-
  test/CodeGen/R600/load.ll  | 178 
 +++--
  2 files changed, 98 insertions(+), 88 deletions(-)

 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index 9537405..eb08a13 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -478,9 +478,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, 
 SelectionDAG &DAG) const {
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
case ISD::LOAD: {
  LoadSDNode *Load = dyn_cast<LoadSDNode>(Op);
 -if ((Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
 - Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
 -Op.getValueType().isVector()) {
 +if (Op.getValueType().isVector() &&
 +(Load->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
 + Load->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
 + (Load->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
 +  Op.getValueType().getVectorNumElements() > 4))) {
SDValue MergedValues[2] = {
  SplitVectorLoad(Op, DAG),
  Load->getChain()
 diff --git a/test/CodeGen/R600/load.ll b/test/CodeGen/R600/load.ll
 index 0153524..1486c4d 100644
 --- a/test/CodeGen/R600/load.ll
 +++ b/test/CodeGen/R600/load.ll
 @@ -1,16 +1,15 @@
 -; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
 --check-prefix=R600-CHECK %s
 -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck 
 --check-prefix=R600-CHECK %s
 -; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
 --check-prefix=SI-CHECK  %s
 +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck 
 --check-prefix=R600-CHECK --check-prefix=FUNC %s
 +; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck 
 --check-prefix=R600-CHECK --check-prefix=FUNC %s
 +; RUN: llc < %s -march=r600 -mcpu=SI -verify-machineinstrs | FileCheck 
 --check-prefix=SI-CHECK --check-prefix=FUNC %s

  
 ;======;
  ; GLOBAL ADDRESS SPACE
  
 ;======;

  ; Load an i8 value from the global address space.
 -; R600-CHECK-LABEL: @load_i8
 +; FUNC-LABEL: @load_i8
  ; R600-CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}}

 -; SI-CHECK-LABEL: @load_i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE v{{[0-9]+}},
  define void @load_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
%1 = load i8 addrspace(1)* %in
 @@ -19,13 +18,12 @@ define void @load_i8(i32 addrspace(1)* %out, i8 
 addrspace(1)* %in) {
ret void
  }

 -; R600-CHECK-LABEL: @load_i8_sext
 +; FUNC-LABEL: @load_i8_sext
  ; R600-CHECK: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]]
  ; R600-CHECK: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_CHAN:[XYZW]]], [[DST]]
  ; R600-CHECK: 24
  ; R600-CHECK: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_CHAN]]
  ; R600-CHECK: 24
 -; SI-CHECK-LABEL: @load_i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_i8_sext(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
  entry:
 @@ -35,10 +33,9 @@ entry:
ret void
  }

 -; R600-CHECK-LABEL: @load_v2i8
 +; FUNC-LABEL: @load_v2i8
  ; R600-CHECK: VTX_READ_8
  ; R600-CHECK: VTX_READ_8
 -; SI-CHECK-LABEL: @load_v2i8
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  ; SI-CHECK: BUFFER_LOAD_UBYTE
  define void @load_v2i8(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* 
 %in) {
 @@ -49,7 +46,7 @@ entry:
ret void
  }

 -; R600-CHECK-LABEL: @load_v2i8_sext
 +; FUNC-LABEL: @load_v2i8_sext
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_X:T[0-9]\.[XYZW]]], [[DST_X]]
  ; R600-CHECK-DAG: VTX_READ_8 [[DST_Y:T[0-9]\.[XYZW]]], [[DST_Y]]
  ; R600-CHECK-DAG: LSHL {{[* ]*}}T{{[0-9]}}.[[LSHL_X_CHAN:[XYZW]]], [[DST_X]]
 @@ -60,7 +57,6 @@ entry:
  ; R600-CHECK-DAG: 24
  ; R600-CHECK-DAG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, PV.[[LSHL_Y_CHAN]]
  ; R600-CHECK-DAG: 24
 -; SI-CHECK-LABEL: @load_v2i8_sext
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  ; SI-CHECK: BUFFER_LOAD_SBYTE
  define void @load_v2i8_sext(2 x i32