From: Pan Xiuli <xiuli....@intel.com>

Add intel subgroup short mem block read/write and image block read/write,
also fix some old block read/write bugs.
Refine old uint block read/write with _ui suffix.

Signed-off-by: Pan Xiuli <xiuli....@intel.com>
---
 backend/src/backend/gen_context.cpp        | 190 +++++++++++++++++--------
 backend/src/backend/gen_encoder.cpp        |  26 +++-
 backend/src/backend/gen_insn_selection.cpp |  37 +++--
 backend/src/ir/instruction.cpp             |  26 ++--
 backend/src/ir/instruction.hpp             |   6 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 221 ++++++++++++++++++++++++-----
 backend/src/libocl/tmpl/ocl_simd.tmpl.h    |  48 ++++++-
 backend/src/llvm/llvm_gen_backend.cpp      | 125 +++++++++++-----
 backend/src/llvm/llvm_gen_ocl_function.hxx |  50 ++++---
 backend/src/llvm/llvm_scalarize.cpp        |  42 ++++--
 10 files changed, 573 insertions(+), 198 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index a1ae5ea..6bb0f22 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3501,12 +3501,14 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) {
-    const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), 
GEN_TYPE_UD);
+    const GenRegister dst= ra->genReg(insn.dst(1));
+    uint32_t type = dst.type;
+    uint32_t typesize = typeSize(type);
     const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
     const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
     const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
     const uint32_t vec_size = insn.extra.elem;
-    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), GEN_TYPE_UD);
+    const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), type);
     const uint32_t simdWidth = p->curr.execWidth;
 
     // Make header
@@ -3532,7 +3534,7 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
+        p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
       }
       p->pop();
     } else if (vec_size == 2) {
@@ -3540,14 +3542,41 @@ namespace gbe
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
+        p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
       }
       p->pop();
       p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
-    } else if (vec_size == 4 || vec_size == 8) {
+      p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * 
typesize ));
+    } else if (vec_size == 4) {
       if (simdWidth == 8) {
-        for (uint32_t i = 0; i < vec_size / 4; i++) {
+        p->push();
+        {
+          p->curr.execWidth = 16;
+          p->curr.noMask = 1;
+          p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
+        }
+        p->pop();
+        for (uint32_t j = 0; j < 4; j++)
+          p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * 
simdWidth * typesize ));
+      } else {
+        for (uint32_t i = 0; i < typesize / 2; i++) {
+          if (i > 0) {
+            p->push();
+            {
+              // Update the address in header
+              p->curr.execWidth = 1;
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
+            }
+            p->pop();
+          }
+          p->OBREAD(tmp, header, insn.getbti(), 8);
+          for (uint32_t j = 0; j < 8 / typesize ; j++)
+            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), 
GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
+        }
+      }
+    } else if (vec_size == 8) {
+      if (simdWidth == 8) {
+        for (uint32_t i = 0; i < typesize / 2; i++) {
           if (i > 0) {
             p->push();
             {
@@ -3564,11 +3593,11 @@ namespace gbe
             p->OBREAD(tmp, header, insn.getbti(), 8);
           }
           p->pop();
-          for (uint32_t j = 0; j < 4; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), 
GenRegister::offset(tmp, j));
+          for (uint32_t j = 0; j < 16 / typesize; j++)
+            p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), 
GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
         }
       } else {
-        for (uint32_t i = 0; i < vec_size / 2; i++) {
+        for (uint32_t i = 0; i < typesize ; i++) {
           if (i > 0) {
             p->push();
             {
@@ -3579,8 +3608,8 @@ namespace gbe
             p->pop();
           }
           p->OBREAD(tmp, header, insn.getbti(), 8);
-          for (uint32_t j = 0; j < 2; j++)
-            p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), 
GenRegister::offset(tmp, j*2));
+          for (uint32_t j = 0; j < 8 / typesize; j++)
+            p->MOV(ra->genReg(insn.dst(1 + j + i * 8 / typesize)), 
GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
         }
       }
     } else NOT_SUPPORTED;
@@ -3590,6 +3619,8 @@ namespace gbe
     const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
     const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
     const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
+    uint32_t type = ra->genReg(insn.src(1)).type;
+    uint32_t typesize = typeSize(type);
     const uint32_t vec_size = insn.extra.elem;
     const GenRegister tmp = GenRegister::offset(header, 1);
     const uint32_t simdWidth = p->curr.execWidth;
@@ -3613,29 +3644,56 @@ namespace gbe
     p->pop();
     // Now write the data, oword block write can only work with simd16 and no 
mask
     if (vec_size == 1) {
-      p->MOV(tmp, ra->genReg(insn.src(1)));
+      p->MOV(GenRegister::retype(tmp, type), ra->genReg(insn.src(1)));
       p->push();
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth / 4);
+        p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 16);
       }
       p->pop();
     } else if (vec_size == 2) {
-      p->MOV(GenRegister::offset(tmp, 0), ra->genReg(insn.src(1))) ;
-      p->MOV(GenRegister::offset(tmp, simdWidth / 8), ra->genReg(insn.src(2))) 
;
+      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, 0), type), 
ra->genReg(insn.src(1)));
+      p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, simdWidth * 
typesize), type), ra->genReg(insn.src(2)));
       p->push();
       {
         p->curr.execWidth = 16;
         p->curr.noMask = 1;
-        p->OBWRITE(header, insn.getbti(), simdWidth / 2);
+        p->OBWRITE(header, insn.getbti(), simdWidth * typesize / 8);
       }
       p->pop();
-    } else if (vec_size == 4 || vec_size == 8) {
+    } else if (vec_size == 4) {
       if (simdWidth == 8) {
-        for (uint32_t i = 0; i < vec_size / 4; i++) {
-          for (uint32_t j = 0; j < 4; j++)
-            p->MOV(GenRegister::offset(tmp, j), ra->genReg(insn.src(1 + j + 
i*4))) ;
+        for (uint32_t i = 0; i < 4; i++)
+          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * simdWidth 
* typesize), type), ra->genReg(insn.src(1 + i)));
+        p->push();
+        {
+          p->curr.execWidth = 16;
+          p->curr.noMask = 1;
+          p->OBWRITE(header, insn.getbti(), 2 * typesize);
+        }
+        p->pop();
+      } else {
+        for (uint32_t i = 0; i < typesize / 2; i++) {
+          for (uint32_t j = 0; j < 8 / typesize; j++)
+            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * 
simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
+          if (i > 0) {
+            p->push();
+            {
+              // Update the address in header
+              p->curr.execWidth = 1;
+              p->ADD(headeraddr, headeraddr, GenRegister::immud(8));
+            }
+            p->pop();
+          }
+          p->OBWRITE(header, insn.getbti(), 8);
+        }
+      }
+    } else if (vec_size == 8) {
+      if (simdWidth == 8) {
+        for (uint32_t i = 0; i < typesize / 2; i++) {
+          for (uint32_t j = 0; j < 16 / typesize; j++)
+            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * 
simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 16 / typesize)));
           if (i > 0) {
             p->push();
             {
@@ -3654,9 +3712,9 @@ namespace gbe
           p->pop();
         }
       } else {
-        for (uint32_t i = 0; i < vec_size / 2; i++) {
-          for (uint32_t j = 0; j < 2; j++)
-            p->MOV(GenRegister::offset(tmp, j * 2), ra->genReg(insn.src(1 + j 
+ i*2))) ;
+        for (uint32_t i = 0; i < typesize; i++) {
+          for (uint32_t j = 0; j < 8 / typesize; j++)
+            p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, j * 
simdWidth * typesize), type), ra->genReg(insn.src(1 + j + i * 8 / typesize)));
           if (i > 0) {
             p->push();
             {
@@ -3682,7 +3740,10 @@ namespace gbe
     const GenRegister offsety = GenRegister::offset(header, 0, 1*4);
     const GenRegister blocksizereg = GenRegister::offset(header, 0, 2*4);
     size_t vec_size = insn.extra.elem;
-    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+    uint32_t type = dst.type;
+    uint32_t typesize = typeSize(type);
+    uint32_t block_width = typesize * simdWidth;
+    uint32_t blocksize = (block_width - 1) % 32 | (vec_size - 1) << 16;
 
     if (simdWidth == 8)
     {
@@ -3699,9 +3760,12 @@ namespace gbe
         p->MOV(offsety, coordy);
         // Update block width and height
         p->MOV(blocksizereg, GenRegister::immud(blocksize));
-        // Now read the data
         p->curr.execWidth = 8;
-        p->MBREAD(dst, header, insn.getbti(), vec_size);
+        // ushort in simd8 will have half reg, but response length is still 1
+        uint32_t rsize = vec_size * typesize / 4;
+        rsize = rsize ? rsize : 1;
+        // Now read the data
+        p->MBREAD(dst, header, insn.getbti(), rsize);
       p->pop();
 
     }
@@ -3726,21 +3790,24 @@ namespace gbe
         p->curr.execWidth = 8;
         p->MBREAD(tmp, header, insn.getbti(), vec_size);
         for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(ra->genReg(insn.dst(i + 1)), GenRegister::offset(tmp, i));
-
-        // Second half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-        // Now read the data
-        p->curr.execWidth = 8;
-        p->MBREAD(tmp, header, insn.getbti(), vec_size);
+          p->MOV(GenRegister::retype(ra->genReg(insn.dst(i + 1)),GEN_TYPE_UD), 
GenRegister::offset(tmp, i));
 
-        // Move the reg to fit vector rule.
-        for (uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
-                 GenRegister::offset(tmp, i));
+        if (typesize == 4)
+        {
+          // Second half
+          // Update the header with the coord
+          p->curr.execWidth = 1;
+          p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          // Now read the data
+          p->curr.execWidth = 8;
+          p->MBREAD(tmp, header, insn.getbti(), vec_size);
+
+          // Move the reg to fit vector rule.
+          for (uint32_t i = 0; i < vec_size; i++)
+            p->MOV(GenRegister::offset(ra->genReg(insn.dst(i + 1)), 1),
+                   GenRegister::offset(tmp, i));
+        }
       p->pop();
     } else NOT_IMPLEMENTED;
   }
@@ -3749,9 +3816,13 @@ namespace gbe
     const GenRegister coordx = GenRegister::toUniform(ra->genReg(insn.src(0)), 
GEN_TYPE_D);
     const GenRegister coordy = GenRegister::toUniform(ra->genReg(insn.src(1)), 
GEN_TYPE_D);
     const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
+    const GenRegister tmp = GenRegister::offset(header, 1);
     GenRegister offsetx, offsety, blocksizereg;
     size_t vec_size = insn.extra.elem;
-    uint32_t blocksize = 0x1F | (vec_size-1) << 16;
+    uint32_t type = ra->genReg(insn.src(2)).type;
+    uint32_t typesize = typeSize(type);
+    uint32_t block_width = typesize * simdWidth;
+    uint32_t blocksize = (block_width - 1) % 32 | (vec_size-1) << 16;
 
     offsetx = GenRegister::offset(header, 0, 0*4);
     offsety = GenRegister::offset(header, 0, 1*4);
@@ -3775,9 +3846,13 @@ namespace gbe
         p->curr.execWidth = 8;
         // Mov what we need into msgs
         for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + 
i)));
+          p->MOV(GenRegister::retype(GenRegister::offset(tmp, 0, i * 
block_width), type),
+                 ra->genReg(insn.src(2 + i)));
+        // ushort in simd8 will have half reg, but response length is still 1
+        uint32_t rsize = vec_size * typesize / 4;
+        rsize = rsize ? rsize : 1;
         // Now read the data
-        p->MBWRITE(header, insn.getbti(), vec_size);
+        p->MBWRITE(header, insn.getbti(), rsize);
       p->pop();
 
     }
@@ -3801,20 +3876,23 @@ namespace gbe
         p->curr.execWidth = 8;
         // Mov what we need into msgs
         for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), ra->genReg(insn.src(2 + 
i)));
+          p->MOV(GenRegister::offset(tmp, i), 
GenRegister::retype(ra->genReg(insn.src(2 + i)), GEN_TYPE_UD));
         p->MBWRITE(header, insn.getbti(), vec_size);
 
-        // Second half
-        // Update the header with the coord
-        p->curr.execWidth = 1;
-        p->ADD(offsetx, offsetx, GenRegister::immud(32));
-
-        p->curr.execWidth = 8;
-        // Mov what we need into msgs
-        for(uint32_t i = 0; i < vec_size; i++)
-          p->MOV(GenRegister::offset(header, 1 + i), 
GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
-        // Now write the data
-        p->MBWRITE(header, insn.getbti(), vec_size);
+        if (typesize == 4)
+        {
+          // Second half
+          // Update the header with the coord
+          p->curr.execWidth = 1;
+          p->ADD(offsetx, offsetx, GenRegister::immud(32));
+
+          p->curr.execWidth = 8;
+          // Mov what we need into msgs
+          for(uint32_t i = 0; i < vec_size; i++)
+            p->MOV(GenRegister::offset(header, 1 + i), 
GenRegister::Qn(ra->genReg(insn.src(2 + i)), 1));
+          // Now write the data
+          p->MBWRITE(header, insn.getbti(), vec_size);
+        }
 
       p->pop();
     }
diff --git a/backend/src/backend/gen_encoder.cpp 
b/backend/src/backend/gen_encoder.cpp
index 975e1c7..a6f8db8 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -269,10 +269,10 @@ namespace gbe
   {
     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
-    assert(size == 2 || size == 4 || size == 8);
+    assert(size == 0 || size == 1 || size == 2 || size == 4 || size == 8);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
     insn->bits3.gen7_oblock_rw.bti = bti;
-    insn->bits3.gen7_oblock_rw.block_size = size == 2 ? 2 : (size == 4 ? 3 : 
4);
+    insn->bits3.gen7_oblock_rw.block_size = size <=  2 ? size : (size == 4 ? 3 
: 4);
     insn->bits3.gen7_oblock_rw.header_present = 1;
   }
 
@@ -1261,7 +1261,17 @@ namespace gbe
   void GenEncoder::OBREAD(GenRegister dst, GenRegister header, uint32_t bti, 
uint32_t size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
     const uint32_t msg_length = 1;
-    const uint32_t response_length = size / 2; // Size is in regs
+    uint32_t rsize = size / 2;
+    uint32_t msgsize = size;
+    // When size is 1 OWord, which means half a reg, we need to know which 
half to use
+    if (size == 1) {
+      if (dst.subnr == 0)
+        msgsize = 0;
+      else
+        msgsize = 1;
+    }
+    rsize = rsize == 0 ? 1 : rsize;
+    const uint32_t response_length = rsize; // Size is in regs
     this->setHeader(insn);
     this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
@@ -1269,7 +1279,7 @@ namespace gbe
     setOBlockRW(this,
                 insn,
                 bti,
-                size,
+                msgsize,
                 GEN7_UNALIGNED_OBLOCK_READ,
                 msg_length,
                 response_length);
@@ -1277,8 +1287,12 @@ namespace gbe
 
   void GenEncoder::OBWRITE(GenRegister header, uint32_t bti, uint32_t size) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
-    const uint32_t msg_length = 1 + size / 2; // Size is in owords
+    uint32_t rsize = size / 2;
+    rsize = rsize == 0 ? 1 : rsize;
+    const uint32_t msg_length = 1 + rsize; // Size is in owords
     const uint32_t response_length = 0;
+    uint32_t msgsize = size;
+    msgsize = msgsize == 1 ? 0 : msgsize;
     this->setHeader(insn);
     this->setSrc0(insn, GenRegister::ud8grf(header.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
@@ -1286,7 +1300,7 @@ namespace gbe
     setOBlockRW(this,
                 insn,
                 bti,
-                size,
+                msgsize,
                 GEN7_OBLOCK_WRITE,
                 msg_length,
                 response_length);
diff --git a/backend/src/backend/gen_insn_selection.cpp 
b/backend/src/backend/gen_insn_selection.cpp
index d506d96..475cad8 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -2089,7 +2089,6 @@ namespace gbe
 
     uint32_t simdWidth = curr.execWidth;
     SelectionInstruction *insn = this->appendInsn(SEL_OP_MBREAD, vec_size * 
simdWidth / 8 + 1, 2);
-
     insn->dst(0) = header;
     for (uint32_t i = 0; i < vec_size; ++i) {
       insn->dst(i + 1) = dsts[i];
@@ -4147,16 +4146,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
       using namespace ir;
       const uint32_t vec_size = insn.getValueNum();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const Type type = insn.getValueType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
       for(uint32_t i = 0; i < vec_size; i++)
-        valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+        valuesVec.push_back(sel.selReg(insn.getValue(i), type));
       // check tmp_size for OWORD read need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size / 8;
+      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
+      tmp_size = tmp_size == 0 ? 1 : tmp_size;
       tmp_size = tmp_size > 4 ? 4 : tmp_size;
       vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < tmp_size; i++)
-        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
TYPE_U32));
+        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
GEN_TYPE_UD));
       sel.OBREAD(&valuesVec[0], vec_size, address, header, bti.imm, 
&tmpVec[0], tmp_size);
     }
 
@@ -4332,16 +4334,19 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
       using namespace ir;
       const uint32_t vec_size = insn.getValueNum();
       const uint32_t simdWidth = sel.ctx.getSimdWidth();
-      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const Type type = insn.getValueType();
+      const uint32_t typeSize = type == TYPE_U32 ? 4 : 2;
+      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
       vector<GenRegister> valuesVec;
       for(uint32_t i = 0; i < vec_size; i++)
-        valuesVec.push_back(sel.selReg(insn.getValue(i), TYPE_U32));
+        valuesVec.push_back(sel.selReg(insn.getValue(i), type));
       // check tmp_size for OWORD write need, max 8 OWROD thus 4 regs
-      uint32_t tmp_size = simdWidth * vec_size / 8;
+      uint32_t tmp_size = simdWidth * vec_size * typeSize / 32;
+      tmp_size = tmp_size == 0 ? 1 : tmp_size;
       tmp_size = tmp_size > 4 ? 4 : tmp_size;
       vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < tmp_size; i++)
-        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
TYPE_U32));
+        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
GEN_TYPE_UD));
       sel.OBWRITE(address, &valuesVec[0], vec_size, header, bti.imm, 
&tmpVec[0], tmp_size);
     }
 
@@ -6703,16 +6708,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
       uint32_t simdWidth = sel.curr.execWidth;
+      const Type type = insn.getType();
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for (uint32_t i = 0; i < vec_size; ++i) {
-        valuesVec.push_back(sel.selReg(insn.getDst(i), TYPE_U32));
+        valuesVec.push_back(sel.selReg(insn.getDst(i), type));
         if(simdWidth == 16)
-          
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
TYPE_U32));
+          
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
GEN_TYPE_UD));
       }
       const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
       const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
-      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
       GenRegister *tmp = NULL;
       if(simdWidth == 16)
         tmp = &tmpVec[0];
@@ -6729,16 +6735,17 @@ extern bool OCL_DEBUGINFO; // first defined by calling 
BVAR in program.cpp
     {
       using namespace ir;
       uint32_t vec_size = insn.getVectorSize();
+      const Type type = insn.getType();
       const GenRegister coordx = sel.selReg(insn.getSrc(0), TYPE_U32);
       const GenRegister coordy = sel.selReg(insn.getSrc(1), TYPE_U32);
       vector<GenRegister> valuesVec;
       vector<GenRegister> tmpVec;
       for(uint32_t i = 0; i < vec_size; i++)
       {
-        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), TYPE_U32));
-        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
TYPE_U32));
+        valuesVec.push_back(sel.selReg(insn.getSrc(2 + i), type));
+        
tmpVec.push_back(GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), 
GEN_TYPE_UD));
       }
-      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), TYPE_U32);
+      const GenRegister header = 
GenRegister::retype(GenRegister::f8grf(sel.reg(FAMILY_DWORD)), GEN_TYPE_UD);
       sel.MBWRITE(coordx, coordy, &valuesVec[0], header, &tmpVec[0], 
insn.getImageIndex(), vec_size);
       return true;
     }
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 08a94cd..512055c 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1070,18 +1070,20 @@ namespace ir {
       public TupleDstPolicy<MediaBlockReadInstruction>
     {
     public:
-      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t 
vec_size, Tuple srcTuple, uint8_t srcNum) {
+      INLINE MediaBlockReadInstruction(uint8_t imageIdx, Tuple dst, uint8_t 
vec_size, Tuple srcTuple, uint8_t srcNum, Type type) {
         this->opcode = OP_MBREAD;
         this->dst = dst;
         this->dstNum = vec_size;
         this->src = srcTuple;
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
+        this->type = type;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
-        out << (int)this->getVectorSize();
+        out << "." << type << "."
+            << (int)this->getVectorSize();
         out << " {";
         for (uint32_t i = 0; i < dstNum; ++i)
           out << "%" << this->getDst(fn, i) << (i != (dstNum-1u) ? " " : "");
@@ -1092,12 +1094,14 @@ namespace ir {
       }
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->dstNum; }
+      INLINE Type getType(void) const { return this->type; }
 
       Tuple src;
       Tuple dst;
       uint8_t imageIdx;
       uint8_t srcNum;
       uint8_t dstNum;
+      Type type;
     };
 
     class ALIGNED_INSTRUCTION MediaBlockWriteInstruction :
@@ -1107,17 +1111,19 @@ namespace ir {
     {
     public:
 
-      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, 
uint8_t srcNum, uint8_t vec_size) {
+      INLINE MediaBlockWriteInstruction(uint8_t imageIdx, Tuple srcTuple, 
uint8_t srcNum, uint8_t vec_size, Type type) {
         this->opcode = OP_MBWRITE;
         this->src = srcTuple;
         this->srcNum = srcNum;
         this->imageIdx = imageIdx;
         this->vec_size = vec_size;
+        this->type = type;
       }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const {
         this->outOpcode(out);
-        out << (int)this->getVectorSize()
+        out << "." << type << "."
+            << (int)this->getVectorSize()
             << " 2D surface id " << (int)this->getImageIndex()
             << " byte coord x %" << this->getSrc(fn, 0)
             << " row coord y %" << this->getSrc(fn, 1);
@@ -1128,12 +1134,14 @@ namespace ir {
       }
       INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE uint8_t getVectorSize(void) const { return this->vec_size; }
+      INLINE Type getType(void) const { return this->type; }
 
       Tuple src;
       Register dst[0];
       uint8_t imageIdx;
       uint8_t srcNum;
       uint8_t vec_size;
+      Type type;
     };
 
 #undef ALIGNED_INSTRUCTION
@@ -2375,8 +2383,10 @@ DECL_MEM_FN(PrintfInstruction, uint32_t, getBti(void), 
getBti())
 DECL_MEM_FN(PrintfInstruction, Type, getType(const Function& fn, uint32_t ID), 
getType(fn, ID))
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getImageIndex(void), 
getImageIndex())
 DECL_MEM_FN(MediaBlockReadInstruction, uint8_t, getVectorSize(void), 
getVectorSize())
+DECL_MEM_FN(MediaBlockReadInstruction, Type, getType(void), getType())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getImageIndex(void), 
getImageIndex())
 DECL_MEM_FN(MediaBlockWriteInstruction, uint8_t, getVectorSize(void), 
getVectorSize())
+DECL_MEM_FN(MediaBlockWriteInstruction, Type, getType(void), getType())
 
 #undef DECL_MEM_FN
 
@@ -2684,12 +2694,12 @@ DECL_MEM_FN(MemInstruction, void,     
setBtiReg(Register reg), setBtiReg(reg))
     return internal::PrintfInstruction(dst, srcTuple, typeTuple, srcNum, bti, 
num).convert();
   }
 
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple 
coord, uint8_t srcNum) {
-    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, 
coord, srcNum).convert();
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple 
coord, uint8_t srcNum, Type type) {
+    return internal::MediaBlockReadInstruction(imageIndex, dst, vec_size, 
coord, srcNum, type).convert();
   }
 
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, 
uint8_t vec_size) {
-    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, 
vec_size).convert();
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, 
uint8_t vec_size, Type type) {
+    return internal::MediaBlockWriteInstruction(imageIndex, srcTuple, srcNum, 
vec_size, type).convert();
   }
 
 
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index b2b0b49..98cead1 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -642,6 +642,7 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
+    Type getType(void) const;
   };
 
   /*! Media Block Write.  */
@@ -651,6 +652,7 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
     uint8_t getImageIndex() const;
     uint8_t getVectorSize() const;
+    Type getType(void) const;
   };
 
   /*! Specialize the instruction. Also performs typechecking first based on the
@@ -886,9 +888,9 @@ namespace ir {
   /*! printf */
   Instruction PRINTF(Register dst, Tuple srcTuple, Tuple typeTuple, uint8_t 
srcNum, uint8_t bti, uint16_t num);
   /*! media block read */
-  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple 
coord, uint8_t srcNum);
+  Instruction MBREAD(uint8_t imageIndex, Tuple dst, uint8_t vec_size, Tuple 
coord, uint8_t srcNum, Type type);
   /*! media block write */
-  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, 
uint8_t vec_size);
+  Instruction MBWRITE(uint8_t imageIndex, Tuple srcTuple, uint8_t srcNum, 
uint8_t vec_size, Type type);
 } /* namespace ir */
 } /* namespace gbe */
 
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 9023107..97e33fe 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -188,90 +188,237 @@ INTEL_RANGE_OP(scan_exclusive, max, short, true)
 INTEL_RANGE_OP(scan_exclusive, max, ushort, false)
 
 #undef INTEL_RANGE_OP
-PURE CONST uint __gen_ocl_sub_group_block_read_mem(const global uint* p);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_mem2(const global uint* p);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_mem4(const global uint* p);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_mem8(const global uint* p);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_mem(const global uint* p);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_mem2(const global uint* p);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_mem4(const global uint* p);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_mem8(const global uint* p);
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p)
 {
-  return __gen_ocl_sub_group_block_read_mem(p);
+  return __gen_ocl_sub_group_block_read_ui_mem(p);
 }
 OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p)
 {
-  return __gen_ocl_sub_group_block_read_mem2(p);
+  return __gen_ocl_sub_group_block_read_ui_mem2(p);
 }
 OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p)
 {
-  return __gen_ocl_sub_group_block_read_mem4(p);
-
+  return __gen_ocl_sub_group_block_read_ui_mem4(p);
 }
 OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p)
 {
-  return __gen_ocl_sub_group_block_read_mem8(p);
+  return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p)
+{
+  return __gen_ocl_sub_group_block_read_ui_mem(p);
 }
-void __gen_ocl_sub_group_block_write_mem(const global uint* p, uint data);
-void __gen_ocl_sub_group_block_write_mem2(const global uint* p, uint2 data);
-void __gen_ocl_sub_group_block_write_mem4(const global uint* p, uint4 data);
-void __gen_ocl_sub_group_block_write_mem8(const global uint* p, uint8 data);
-OVERLOADABLE void intel_sub_group_block_write(const global uint* p, uint data)
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p)
 {
-  __gen_ocl_sub_group_block_write_mem(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem2(p);
 }
-OVERLOADABLE void intel_sub_group_block_write2(const global uint* p, uint2 data)
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p)
 {
-  __gen_ocl_sub_group_block_write_mem2(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem4(p);
 }
-OVERLOADABLE void intel_sub_group_block_write4(const global uint* p,uint4 data)
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p)
 {
-  __gen_ocl_sub_group_block_write_mem4(p, data);
+  return __gen_ocl_sub_group_block_read_ui_mem8(p);
+}
 
+void __gen_ocl_sub_group_block_write_ui_mem(global uint* p, uint data);
+void __gen_ocl_sub_group_block_write_ui_mem2(global uint* p, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_mem4(global uint* p, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_mem8(global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(global uint* p, uint data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write2(global uint* p, uint2 data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write4(global uint* p,uint4 data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem4(p, data);
 }
-OVERLOADABLE void intel_sub_group_block_write8(const global uint* p,uint8 data)
+OVERLOADABLE void intel_sub_group_block_write8(global uint* p,uint8 data)
 {
-  __gen_ocl_sub_group_block_write_mem8(p, data);
+  __gen_ocl_sub_group_block_write_ui_mem8(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(global uint* p, uint data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(global uint* p, uint2 data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(global uint* p,uint4 data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(global uint* p,uint8 data)
+{
+  __gen_ocl_sub_group_block_write_ui_mem8(p, data);
 }
 
-PURE CONST uint __gen_ocl_sub_group_block_read_image(image2d_t p, int x, int y);
-PURE CONST uint2 __gen_ocl_sub_group_block_read_image2(image2d_t p, int x, int y);
-PURE CONST uint4 __gen_ocl_sub_group_block_read_image4(image2d_t p, int x, int y);
-PURE CONST uint8 __gen_ocl_sub_group_block_read_image8(image2d_t p, int x, int y);
+PURE CONST uint __gen_ocl_sub_group_block_read_ui_image(image2d_t p, int x, int y);
+PURE CONST uint2 __gen_ocl_sub_group_block_read_ui_image2(image2d_t p, int x, int y);
+PURE CONST uint4 __gen_ocl_sub_group_block_read_ui_image4(image2d_t p, int x, int y);
+PURE CONST uint8 __gen_ocl_sub_group_block_read_ui_image8(image2d_t p, int x, int y);
 OVERLOADABLE uint intel_sub_group_block_read(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_image(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
 }
 OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_image2(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
 }
 OVERLOADABLE uint4 intel_sub_group_block_read4(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_image4(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
 }
 OVERLOADABLE uint8 intel_sub_group_block_read8(image2d_t p, int2 cord)
 {
-  return __gen_ocl_sub_group_block_read_image8(p, cord.x, cord.y);
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
 }
-void __gen_ocl_sub_group_block_write_image(image2d_t p, int x, int y, uint data);
-void __gen_ocl_sub_group_block_write_image2(image2d_t p, int x, int y, uint2 data);
-void __gen_ocl_sub_group_block_write_image4(image2d_t p, int x, int y, uint4 data);
-void __gen_ocl_sub_group_block_write_image8(image2d_t p, int x, int y, uint8 data);
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_ui_image(p, cord.x, cord.y);
+}
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_ui_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_ui_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_ui_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_ui_image(image2d_t p, int x, int y, uint data);
+void __gen_ocl_sub_group_block_write_ui_image2(image2d_t p, int x, int y, uint2 data);
+void __gen_ocl_sub_group_block_write_ui_image4(image2d_t p, int x, int y, uint4 data);
+void __gen_ocl_sub_group_block_write_ui_image8(image2d_t p, int x, int y, uint8 data);
 OVERLOADABLE void intel_sub_group_block_write(image2d_t p, int2 cord, uint data)
 {
-  __gen_ocl_sub_group_block_write_image(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
 }
 OVERLOADABLE void intel_sub_group_block_write2(image2d_t p, int2 cord, uint2 data)
 {
-  __gen_ocl_sub_group_block_write_image2(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
 }
 OVERLOADABLE void intel_sub_group_block_write4(image2d_t p, int2 cord, uint4 data)
 {
-  __gen_ocl_sub_group_block_write_image4(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
 }
 OVERLOADABLE void intel_sub_group_block_write8(image2d_t p, int2 cord, uint8 data)
 {
-  __gen_ocl_sub_group_block_write_image8(p, cord.x, cord.y, data);
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t p, int2 cord, uint data)
+{
+  __gen_ocl_sub_group_block_write_ui_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t p, int2 cord, uint2 data)
+{
+  __gen_ocl_sub_group_block_write_ui_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t p, int2 cord, uint4 data)
+{
+  __gen_ocl_sub_group_block_write_ui_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t p, int2 cord, uint8 data)
+{
+  __gen_ocl_sub_group_block_write_ui_image8(p, cord.x, cord.y, data);
 }
 
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_mem(const global ushort* p);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_mem2(const global ushort* p);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_mem4(const global ushort* p);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_mem8(const global ushort* p);
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p)
+{
+  return __gen_ocl_sub_group_block_read_us_mem(p);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p)
+{
+  return __gen_ocl_sub_group_block_read_us_mem2(p);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p)
+{
+  return __gen_ocl_sub_group_block_read_us_mem4(p);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p)
+{
+  return __gen_ocl_sub_group_block_read_us_mem8(p);
+}
+
+void __gen_ocl_sub_group_block_write_us_mem(global ushort* p, ushort data);
+void __gen_ocl_sub_group_block_write_us_mem2(global ushort* p, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_mem4(global ushort* p, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_mem8(global ushort* p, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(global ushort* p, ushort data)
+{
+  __gen_ocl_sub_group_block_write_us_mem(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(global ushort* p, ushort2 data)
+{
+  __gen_ocl_sub_group_block_write_us_mem2(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(global ushort* p,ushort4 data)
+{
+  __gen_ocl_sub_group_block_write_us_mem4(p, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(global ushort* p,ushort8 data)
+{
+  __gen_ocl_sub_group_block_write_us_mem8(p, data);
+}
+
+PURE CONST ushort __gen_ocl_sub_group_block_read_us_image(image2d_t p, int x, int y);
+PURE CONST ushort2 __gen_ocl_sub_group_block_read_us_image2(image2d_t p, int x, int y);
+PURE CONST ushort4 __gen_ocl_sub_group_block_read_us_image4(image2d_t p, int x, int y);
+PURE CONST ushort8 __gen_ocl_sub_group_block_read_us_image8(image2d_t p, int x, int y);
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_us_image(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_us_image2(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_us_image4(p, cord.x, cord.y);
+}
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t p, int2 cord)
+{
+  return __gen_ocl_sub_group_block_read_us_image8(p, cord.x, cord.y);
+}
+
+void __gen_ocl_sub_group_block_write_us_image(image2d_t p, int x, int y, ushort data);
+void __gen_ocl_sub_group_block_write_us_image2(image2d_t p, int x, int y, ushort2 data);
+void __gen_ocl_sub_group_block_write_us_image4(image2d_t p, int x, int y, ushort4 data);
+void __gen_ocl_sub_group_block_write_us_image8(image2d_t p, int x, int y, ushort8 data);
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t p, int2 cord, ushort data)
+{
+  __gen_ocl_sub_group_block_write_us_image(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t p, int2 cord, ushort2 data)
+{
+  __gen_ocl_sub_group_block_write_us_image2(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t p, int2 cord, ushort4 data)
+{
+  __gen_ocl_sub_group_block_write_us_image4(p, cord.x, cord.y, data);
+}
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t p, int2 cord, ushort8 data)
+{
+  __gen_ocl_sub_group_block_write_us_image8(p, cord.x, cord.y, data);
+}
 #define SHUFFLE_DOWN(TYPE) \
 OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \
   TYPE res0, res1; \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index 158c8e1..608551b 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -196,10 +196,10 @@ OVERLOADABLE uint2 intel_sub_group_block_read2(const global uint* p);
 OVERLOADABLE uint4 intel_sub_group_block_read4(const global uint* p);
 OVERLOADABLE uint8 intel_sub_group_block_read8(const global uint* p);
 
-OVERLOADABLE void intel_sub_group_block_write(const __global uint* p, uint data);
-OVERLOADABLE void intel_sub_group_block_write2(const __global uint* p, uint2 data);
-OVERLOADABLE void intel_sub_group_block_write4(const __global uint* p, uint4 data);
-OVERLOADABLE void intel_sub_group_block_write8(const __global uint* p, uint8 data);
+OVERLOADABLE void intel_sub_group_block_write(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write8(__global uint* p, uint8 data);
 
 OVERLOADABLE uint intel_sub_group_block_read(image2d_t image, int2 byte_coord);
 OVERLOADABLE uint2 intel_sub_group_block_read2(image2d_t image, int2 byte_coord);
@@ -210,3 +210,43 @@ OVERLOADABLE void intel_sub_group_block_write(image2d_t image, int2 byte_coord,
 OVERLOADABLE void intel_sub_group_block_write2(image2d_t image, int2 byte_coord, uint2 data);
 OVERLOADABLE void intel_sub_group_block_write4(image2d_t image, int2 byte_coord, uint4 data);
 OVERLOADABLE void intel_sub_group_block_write8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(const global uint* p);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(const global uint* p);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(const global uint* p);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(const global uint* p);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(__global uint* p, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(__global uint* p, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(__global uint* p, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(__global uint* p, uint8 data);
+
+OVERLOADABLE uint intel_sub_group_block_read_ui(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint2 intel_sub_group_block_read_ui2(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint4 intel_sub_group_block_read_ui4(image2d_t image, int2 byte_coord);
+OVERLOADABLE uint8 intel_sub_group_block_read_ui8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_ui(image2d_t image, int2 byte_coord, uint data);
+OVERLOADABLE void intel_sub_group_block_write_ui2(image2d_t image, int2 byte_coord, uint2 data);
+OVERLOADABLE void intel_sub_group_block_write_ui4(image2d_t image, int2 byte_coord, uint4 data);
+OVERLOADABLE void intel_sub_group_block_write_ui8(image2d_t image, int2 byte_coord, uint8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(const global ushort* p);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(const global ushort* p);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(const global ushort* p);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(const global ushort* p);
+
+OVERLOADABLE void intel_sub_group_block_write_us(__global ushort* p, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(__global ushort* p, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(__global ushort* p, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(__global ushort* p, ushort8 data);
+
+OVERLOADABLE ushort intel_sub_group_block_read_us(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort2 intel_sub_group_block_read_us2(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort4 intel_sub_group_block_read_us4(image2d_t image, int2 byte_coord);
+OVERLOADABLE ushort8 intel_sub_group_block_read_us8(image2d_t image, int2 byte_coord);
+
+OVERLOADABLE void intel_sub_group_block_write_us(image2d_t image, int2 byte_coord, ushort data);
+OVERLOADABLE void intel_sub_group_block_write_us2(image2d_t image, int2 byte_coord, ushort2 data);
+OVERLOADABLE void intel_sub_group_block_write_us4(image2d_t image, int2 byte_coord, ushort4 data);
+OVERLOADABLE void intel_sub_group_block_write_us8(image2d_t image, int2 byte_coord, ushort8 data);
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 43c7c4c..a6a249d 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -700,8 +700,8 @@ namespace gbe
     // Emit subgroup instructions
     void emitSubGroupInst(CallInst &I, CallSite &CS, ir::WorkGroupOps opcode);
     // Emit subgroup instructions
-    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
-    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size);
+    void emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
+    void emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type = ir::TYPE_U32);
 
     uint8_t appendSampler(CallSite::arg_iterator AI);
     uint8_t getImageID(CallInst &I);
@@ -3853,14 +3853,22 @@ namespace gbe
       case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MAX:
       case GEN_OCL_SUB_GROUP_SCAN_INCLUSIVE_MIN:
       case GEN_OCL_LRP:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
-      case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -3877,14 +3885,22 @@ namespace gbe
       case GEN_OCL_CALC_TIMESTAMP:
       case GEN_OCL_STORE_PROFILING:
       case GEN_OCL_DEBUGWAIT:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
-      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+      case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
         break;
       case GEN_OCL_NOT_FOUND:
       default:
@@ -4077,7 +4093,7 @@ namespace gbe
     GBE_ASSERT(AI == AE);
   }
 
-  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+  void GenWriter::emitBlockReadWriteMemInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
@@ -4113,7 +4129,6 @@ namespace gbe
       ptr = pointer;
     }
 
-    ir::Type type = ir::TYPE_U32;
     GBE_ASSERT(AM != ir::AM_DynamicBti);
 
     if(isWrite){
@@ -4134,7 +4149,7 @@ namespace gbe
     GBE_ASSERT(AI == AE);
   }
 
-  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size) {
+  void GenWriter::emitBlockReadWriteImageInst(CallInst &I, CallSite &CS, bool isWrite, uint8_t vec_size, ir::Type type) {
     CallSite::arg_iterator AI = CS.arg_begin();
     CallSite::arg_iterator AE = CS.arg_end();
     GBE_ASSERT(AI != AE);
@@ -4150,7 +4165,7 @@ namespace gbe
         srcTupleData.push_back(getRegister(*(AI), i));
       AI++;
       const ir::Tuple srctuple = ctx.arrayTuple(&srcTupleData[0], 2 + vec_size);
-      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size);
+      ctx.MBWRITE(imageID, srctuple, 2 + vec_size, vec_size, type);
     } else {
       ir::Register src[2];
       src[0] = getRegister(*(AI++));
@@ -4160,7 +4175,7 @@ namespace gbe
         dstTupleData.push_back(getRegister(&I, i));
       const ir::Tuple srctuple = ctx.arrayTuple(src, 2);
       const ir::Tuple dsttuple = ctx.arrayTuple(&dstTupleData[0], vec_size);
-      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2);
+      ctx.MBREAD(imageID, dsttuple, vec_size, srctuple, 2, type);
     }
 
     GBE_ASSERT(AI == AE);
@@ -4993,38 +5008,70 @@ namespace gbe
             ctx.LRP(ir::TYPE_FLOAT, dst, src0, src1, src2);
             break;
           }
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM:
             this->emitBlockReadWriteMemInst(I, CS, false, 1); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
             this->emitBlockReadWriteMemInst(I, CS, false, 2); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
             this->emitBlockReadWriteMemInst(I, CS, false, 4); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
             this->emitBlockReadWriteMemInst(I, CS, false, 8); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
             this->emitBlockReadWriteMemInst(I, CS, true, 1); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
             this->emitBlockReadWriteMemInst(I, CS, true, 2); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
             this->emitBlockReadWriteMemInst(I, CS, true, 4); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
             this->emitBlockReadWriteMemInst(I, CS, true, 8); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE:
             this->emitBlockReadWriteImageInst(I, CS, false, 1); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
             this->emitBlockReadWriteImageInst(I, CS, false, 2); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
             this->emitBlockReadWriteImageInst(I, CS, false, 4); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, false, 8); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
             this->emitBlockReadWriteImageInst(I, CS, true, 1); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
             this->emitBlockReadWriteImageInst(I, CS, true, 2); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
             this->emitBlockReadWriteImageInst(I, CS, true, 4); break;
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
             this->emitBlockReadWriteImageInst(I, CS, true, 8); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM:
+            this->emitBlockReadWriteMemInst(I, CS, false, 1, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+            this->emitBlockReadWriteMemInst(I, CS, false, 2, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+            this->emitBlockReadWriteMemInst(I, CS, false, 4, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+            this->emitBlockReadWriteMemInst(I, CS, false, 8, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+            this->emitBlockReadWriteMemInst(I, CS, true, 1, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+            this->emitBlockReadWriteMemInst(I, CS, true, 2, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+            this->emitBlockReadWriteMemInst(I, CS, true, 4, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
+            this->emitBlockReadWriteMemInst(I, CS, true, 8, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, false, 1, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, false, 2, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, false, 4, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, false, 8, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+            this->emitBlockReadWriteImageInst(I, CS, true, 1, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+            this->emitBlockReadWriteImageInst(I, CS, true, 2, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+            this->emitBlockReadWriteImageInst(I, CS, true, 4, ir::TYPE_U16); break;
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
+            this->emitBlockReadWriteImageInst(I, CS, true, 8, ir::TYPE_U16); break;
           default: break;
         }
       }
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index 48a72d1..8ab4373 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -217,22 +217,38 @@ DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_ADD, __gen_ocl_sub_group_scan_in
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MAX, __gen_ocl_sub_group_scan_inclusive_max)
 DECL_LLVM_GEN_FUNCTION(SUB_GROUP_SCAN_INCLUSIVE_MIN, __gen_ocl_sub_group_scan_inclusive_min)
 
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM, __gen_ocl_sub_group_block_read_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM2, __gen_ocl_sub_group_block_read_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM4, __gen_ocl_sub_group_block_read_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_MEM8, __gen_ocl_sub_group_block_read_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM, __gen_ocl_sub_group_block_write_mem)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM2, __gen_ocl_sub_group_block_write_mem2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM4, __gen_ocl_sub_group_block_write_mem4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_MEM8, __gen_ocl_sub_group_block_write_mem8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE, __gen_ocl_sub_group_block_read_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE2, __gen_ocl_sub_group_block_read_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE4, __gen_ocl_sub_group_block_read_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_IMAGE8, __gen_ocl_sub_group_block_read_image8)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE, __gen_ocl_sub_group_block_write_image)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE2, __gen_ocl_sub_group_block_write_image2)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE4, __gen_ocl_sub_group_block_write_image4)
-DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_IMAGE8, __gen_ocl_sub_group_block_write_image8)
-
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM, __gen_ocl_sub_group_block_read_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM2, __gen_ocl_sub_group_block_read_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM4, __gen_ocl_sub_group_block_read_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_MEM8, __gen_ocl_sub_group_block_read_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM, __gen_ocl_sub_group_block_write_ui_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM2, __gen_ocl_sub_group_block_write_ui_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM4, __gen_ocl_sub_group_block_write_ui_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_MEM8, __gen_ocl_sub_group_block_write_ui_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE, __gen_ocl_sub_group_block_read_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE2, __gen_ocl_sub_group_block_read_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE4, __gen_ocl_sub_group_block_read_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_UI_IMAGE8, __gen_ocl_sub_group_block_read_ui_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE, __gen_ocl_sub_group_block_write_ui_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE2, __gen_ocl_sub_group_block_write_ui_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE4, __gen_ocl_sub_group_block_write_ui_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_UI_IMAGE8, __gen_ocl_sub_group_block_write_ui_image8)
+
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM, __gen_ocl_sub_group_block_read_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM2, __gen_ocl_sub_group_block_read_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM4, __gen_ocl_sub_group_block_read_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_MEM8, __gen_ocl_sub_group_block_read_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM, __gen_ocl_sub_group_block_write_us_mem)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM2, __gen_ocl_sub_group_block_write_us_mem2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM4, __gen_ocl_sub_group_block_write_us_mem4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_MEM8, __gen_ocl_sub_group_block_write_us_mem8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE, __gen_ocl_sub_group_block_read_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE2, __gen_ocl_sub_group_block_read_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE4, __gen_ocl_sub_group_block_read_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_READ_US_IMAGE8, __gen_ocl_sub_group_block_read_us_image8)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE, __gen_ocl_sub_group_block_write_us_image)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE2, __gen_ocl_sub_group_block_write_us_image2)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE4, __gen_ocl_sub_group_block_write_us_image4)
+DECL_LLVM_GEN_FUNCTION(SUB_GROUP_BLOCK_WRITE_US_IMAGE8, __gen_ocl_sub_group_block_write_us_image8)
 // common function
 DECL_LLVM_GEN_FUNCTION(LRP, __gen_ocl_lrp)
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 615fb50..8850abb 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -682,10 +682,14 @@ namespace gbe {
             *CI = InsertToVector(call, *CI);
             break;
           }
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE2:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE4:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_IMAGE8:
           {
             ++CI;
             ++CI;
@@ -693,22 +697,32 @@ namespace gbe {
               *CI = InsertToVector(call, *CI);
             break;
           }
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM2:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM4:
-          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_UI_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_WRITE_US_MEM8:
           {
             if ((*CI)->getType()->isVectorTy())
               *CI = InsertToVector(call, *CI);
             break;
           }
           case GEN_OCL_VME:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM2:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM4:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_MEM8:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE2:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE4:
-          case GEN_OCL_SUB_GROUP_BLOCK_READ_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_UI_IMAGE8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_MEM8:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE2:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE4:
+          case GEN_OCL_SUB_GROUP_BLOCK_READ_US_IMAGE8:
             setAppendPoint(call);
             extractFromVector(call);
             break;
-- 
2.7.4

_______________________________________________
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to