Re: [Beignet] [PATCH] Fix build failure with CMRT enabled

2016-10-12 Thread Guo, Yejun
LGTM, thanks.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
Rebecca N. Palmer
Sent: Thursday, October 13, 2016 6:15 AM
To: beignet@lists.freedesktop.org
Subject: [Beignet] [PATCH] Fix build failure with CMRT enabled

2baff9c moved mem->magic to cl_base_object.
---
(Or should this be CL_OBJECT_IS_MEM(mem), i.e. also checking the reference 
count?)

--- a/src/cl_cmrt.cpp
+++ b/src/cl_cmrt.cpp
@@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k,
 result = cmrt_kernel->SetKernelArg(index, sz, value);
   else {
 cl_mem mem = *(cl_mem*)value;
-if (mem->magic == CL_MAGIC_MEM_HEADER) {
+if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) {
   if (!CreateCmrtMemory(mem))
 return CL_INVALID_ARG_VALUE;
 

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH] Fix build failure with CMRT enabled

2016-10-12 Thread Rebecca N. Palmer
2baff9c moved mem->magic to cl_base_object.
---
(Or should this be CL_OBJECT_IS_MEM(mem), i.e. also checking the reference 
count?)

--- a/src/cl_cmrt.cpp
+++ b/src/cl_cmrt.cpp
@@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k,
 result = cmrt_kernel->SetKernelArg(index, sz, value);
   else {
 cl_mem mem = *(cl_mem*)value;
-if (mem->magic == CL_MAGIC_MEM_HEADER) {
+if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) {
   if (!CreateCmrtMemory(mem))
 return CL_INVALID_ARG_VALUE;
 

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


Re: [Beignet] [PATCH v3] Utests: Allow testing cl_intel_accelerator via ICD

2016-10-12 Thread Weng, Chuanbo
Hi Rebecca,
This version LGTM, except for a few points that need minor refinement. Please see 
my comments below.

-Original Message-
From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of 
Rebecca N. Palmer
Sent: Wednesday, October 12, 2016 5:50 AM
To: Weng, Chuanbo ; beignet@lists.freedesktop.org
Subject: Re: [Beignet] [PATCH v3] Utests: Allow testing cl_intel_accelerator 
via ICD

v3: Use extension check, not beignet check.  Treat claiming to have the 
extension but not having the kernel as a failure.

---
(v2 was the un-numbered 10/10/16 08:07 version...which I subsequently noticed 
was broken in that it assumed a non-NULL 
clGetExtensionFunctionAddressForPlatform result meant the extension was 
supported, which it doesn't in general, 
https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/clGetExtensionFunctionAddressForPlatform.html
 )

--- a/utests/builtin_kernel_block_motion_estimate_intel.cpp
+++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp
@@ -8,6 +8,19 @@ OCLRELEASEACCELERATORINTEL * oclReleaseA
 
 void builtin_kernel_block_motion_estimate_intel(void)
 {
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, 
+ _value_size);  std::vector param_value(param_value_size);  
+ OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+   param_value.empty() ? NULL : _value.front(), 
+ _value_size);  if (!param_value.empty())
+extStr = std::string(_value.front(), param_value_size-1);  // 
+ cl_intel_motion_estimation depends on cl_intel_accelerator, so we only 
+ need to check one  if (strstr(extStr.c_str(), "cl_intel_motion_estimation") 
== NULL) {
+printf("No cl_intel_motion_estimation, Skip!");
+return;
+  }
[Chuanbo] It would be better if you wrapped this part of the code in a 
cl_check_motion_estimation() helper and then moved
it to utest_helper.cpp. This would keep the existing code organization style.
There is a bug in Beignet: cl_intel_motion_estimation is supported by IVB only, 
but all devices show string cl_intel_motion_estimation
in their CL_DEVICE_EXTENSIONS. I'll work out a patch to fix this problem.

   char* built_in_kernel_names;
   size_t built_in_kernels_size;
   cl_int err = CL_SUCCESS;
@@ -21,7 +34,8 @@ void builtin_kernel_block_motion_estimat
   if (strstr(built_in_kernel_names, "block_motion_estimate_intel") == NULL)
   {
 free(built_in_kernel_names);
-return;
+printf("Can't find block_motion_estimate_intel built-in kernel");
[Chuanbo] Although I know there are other places that use printf instead 
of fprintf(stderr, ...), let's keep in mind that
we should use fprintf(stderr, ...) for error output.
+OCL_ASSERT(0);
   }
 
   cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, 
, built_in_kernel_names, );
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -287,7 +287,8 @@ set (utests_sources
   multi_queue_events.cpp
   compiler_mix.cpp
   compiler_math_3op.cpp
-  compiler_bsort.cpp)
+  compiler_bsort.cpp
+  builtin_kernel_block_motion_estimate_intel.cpp)
 
 if (LLVM_VERSION_NODOT VERSION_GREATER 34)
   SET(utests_sources
@@ -328,7 +329,6 @@ else(GEN_PCI_ID)
 endif(GEN_PCI_ID)
 
 if (NOT_BUILD_STAND_ALONE_UTEST)
-  SET(utests_sources ${utests_sources} 
builtin_kernel_block_motion_estimate_intel.cpp)
   ADD_CUSTOM_TARGET(kernel_bin.bin DEPENDS ${kernel_bin}.bin)  endif 
(NOT_BUILD_STAND_ALONE_UTEST)
 

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 10/14] Utest: Add test case for sub group short builtin functions

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Signed-off-by: Pan Xiuli 
---
 kernels/compiler_subgroup_reduce.cl | 22 ++
 kernels/compiler_subgroup_scan_exclusive.cl | 36 
 kernels/compiler_subgroup_scan_inclusive.cl | 36 
 utests/compiler_subgroup_reduce.cpp | 66 +
 utests/compiler_subgroup_scan_exclusive.cpp | 66 +
 utests/compiler_subgroup_scan_inclusive.cpp | 66 +
 6 files changed, 292 insertions(+)

diff --git a/kernels/compiler_subgroup_reduce.cl 
b/kernels/compiler_subgroup_reduce.cl
index 6d7ecfd..79d8e7d 100644
--- a/kernels/compiler_subgroup_reduce.cl
+++ b/kernels/compiler_subgroup_reduce.cl
@@ -73,6 +73,17 @@ kernel void compiler_subgroup_reduce_add_float(global float 
*src, global float *
 /*
  * Subgroup reduce max functions
  */
+kernel void compiler_subgroup_reduce_max_short(global short *src, global short 
*dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_max_ushort(global ushort *src, global 
ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  //printf("src is %d\n",val);
+  ushort sum = sub_group_reduce_max(val);
+  dst[get_global_id(0)] = sum;
+}
 kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) 
{
   int val = src[get_global_id(0)];
   int sum = sub_group_reduce_max(val);
@@ -106,6 +117,17 @@ kernel void compiler_subgroup_reduce_max_float(global 
float *src, global float *
 /*
  * Subgroup reduce min functions
  */
+kernel void compiler_subgroup_reduce_min_short(global short *src, global short 
*dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
+kernel void compiler_subgroup_reduce_min_ushort(global ushort *src, global 
ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  //printf("src is %d\n",val);
+  ushort sum = sub_group_reduce_min(val);
+  dst[get_global_id(0)] = sum;
+}
 kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) 
{
   int val = src[get_global_id(0)];
   int sum = sub_group_reduce_min(val);
diff --git a/kernels/compiler_subgroup_scan_exclusive.cl 
b/kernels/compiler_subgroup_scan_exclusive.cl
index ca0ada2..2c4b928 100644
--- a/kernels/compiler_subgroup_scan_exclusive.cl
+++ b/kernels/compiler_subgroup_scan_exclusive.cl
@@ -2,6 +2,18 @@
  * Subgroup scan exclusive add functions
  */
 #ifndef HALF
+kernel void compiler_subgroup_scan_exclusive_add_short(global short *src, 
global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_add_ushort(global ushort *src, 
global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_add(val);
+  dst[get_global_id(0)] = sum;
+}
+
 kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global 
int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_add(val);
@@ -35,6 +47,18 @@ kernel void 
compiler_subgroup_scan_exclusive_add_float(global float *src, global
 /*
  * Subgroup scan exclusive max functions
  */
+kernel void compiler_subgroup_scan_exclusive_max_short(global short *src, 
global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_max_ushort(global ushort *src, 
global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_max(val);
+  dst[get_global_id(0)] = sum;
+}
+
 kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global 
int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_max(val);
@@ -68,6 +92,18 @@ kernel void 
compiler_subgroup_scan_exclusive_max_float(global float *src, global
 /*
  * Subgroup scan exclusive min functions
  */
+kernel void compiler_subgroup_scan_exclusive_min_short(global short *src, 
global short *dst) {
+  short val = src[get_global_id(0)];
+  short sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
+kernel void compiler_subgroup_scan_exclusive_min_ushort(global ushort *src, 
global ushort *dst) {
+  ushort val = src[get_global_id(0)];
+  ushort sum = sub_group_scan_exclusive_min(val);
+  dst[get_global_id(0)] = sum;
+}
+
 kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global 
int *dst) {
   int val = src[get_global_id(0)];
   int sum = sub_group_scan_exclusive_min(val);
diff --git a/kernels/compiler_subgroup_scan_inclusive.cl 
b/kernels/compiler_subgroup_scan_inclusive.cl
index e97521c..def941c 100644
--- a/kernels/compiler_subgroup_scan_inclusive.cl
+++ b/kernels/compiler_subgroup_scan_inclusive.cl
@@ -2,6 +2,18 

[Beignet] [PATCH 06/14] Utest: Add check subgroup short helper function

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Check whether the device supports the intel_subgroups_short extension, after
first checking that it supports the intel_subgroups extension.

Signed-off-by: Pan Xiuli 
---
 utests/utest_helper.cpp | 20 
 utests/utest_helper.hpp |  2 ++
 2 files changed, 22 insertions(+)

diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index d3fc069..f1f5af4 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -899,6 +899,26 @@ int cl_check_subgroups(void)
   return 1;
 }
 
+int cl_check_subgroups_short(void)
+{
+  if (!cl_check_subgroups())
+return 0;
+  std::string extStr;
+  size_t param_value_size;
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, 
_value_size);
+  std::vector param_value(param_value_size);
+  OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size,
+   param_value.empty() ? NULL : _value.front(), 
_value_size);
+  if (!param_value.empty())
+extStr = std::string(_value.front(), param_value_size-1);
+
+  if (std::strstr(extStr.c_str(), "cl_intel_subgroups_short") == NULL) {
+printf("No cl_intel_subgroups_short, Skip!");
+return 0;
+  }
+  return 1;
+}
+
 int cl_check_ocl20(void)
 {
   size_t param_value_size;
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index 034a411..0f4a1ef 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -318,4 +318,6 @@ extern uint32_t __half_to_float(uint16_t h, bool* isInf = 
NULL, bool* infSign =
 extern uint16_t __float_to_half(uint32_t x);
 extern float as_float(uint32_t i);
 extern uint32_t as_uint(float f);
+/* Check is intel subgroups short enabled. */
+extern int cl_check_subgroups_short(void);
 #endif /* __UTEST_HELPER_HPP__ */
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 03/14] Backend: Refine register offset for simd shuffle

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Simd shuffle should support different types; we used to support only float or
dword types. Now the offset is set according to the src type.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_context.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index 4f73237..e907931 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -719,7 +719,7 @@ namespace gbe
 
 p->curr.quarterControl = 1;
 p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / 
typeSize(GEN_TYPE_UW)), baseReg);
-p->MOV(GenRegister::offset(dst, 1, 0), indirect);
+p->MOV(GenRegister::offset(dst, 0, 8 * typeSize(src0.type)), indirect);
   } else
 NOT_IMPLEMENTED;
 p->pop();
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 05/14] Libocl: Add sub group broadcast short builtin function

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Add sub group broadcast and intel sub group broadcast for short type.

Signed-off-by: Pan Xiuli 
---
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 9 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.h  | 4 
 2 files changed, 13 insertions(+)

diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 245ce8a..d1bcfa3 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -50,8 +50,17 @@ BROADCAST_IMPL(ulong)
 BROADCAST_IMPL(half)
 BROADCAST_IMPL(float)
 BROADCAST_IMPL(double)
+BROADCAST_IMPL(short)
+BROADCAST_IMPL(ushort)
 #undef BROADCAST_IMPL
 
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id) {
+  return __gen_ocl_sub_group_broadcast(a, local_id);
+}
+
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id) {
+  return __gen_ocl_sub_group_broadcast(a, local_id);
+}
 
 #define RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \
 OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_##RANGE##_##OP(bool sign, 
GEN_TYPE x); \
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index e8dc6f4..c609c2e 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -42,7 +42,11 @@ OVERLOADABLE ulong sub_group_broadcast(ulong a, uint 
local_id);
 OVERLOADABLE half sub_group_broadcast(half a, uint local_id);
 OVERLOADABLE float sub_group_broadcast(float a, uint local_id);
 OVERLOADABLE double sub_group_broadcast(double a, uint local_id);
+OVERLOADABLE short sub_group_broadcast(short a,uint local_id);
+OVERLOADABLE ushort sub_group_broadcast(ushort a, uint local_id);
 
+OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id);
+OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id);
 /* reduce add */
 OVERLOADABLE int sub_group_reduce_add(int x);
 OVERLOADABLE uint sub_group_reduce_add(uint x);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 13/14] Backend: Add subgroup short block read/write

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Add intel subgroup short mem block read/write and image block read/write;
also fix some old block read/write bugs.
Refine the old uint block read/write with a _ui suffix.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_context.cpp| 190 +
 backend/src/backend/gen_encoder.cpp|  26 +++-
 backend/src/backend/gen_insn_selection.cpp |  37 +++--
 backend/src/ir/instruction.cpp |  26 ++--
 backend/src/ir/instruction.hpp |   6 +-
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl   | 221 -
 backend/src/libocl/tmpl/ocl_simd.tmpl.h|  48 ++-
 backend/src/llvm/llvm_gen_backend.cpp  | 125 +++-
 backend/src/llvm/llvm_gen_ocl_function.hxx |  50 ---
 backend/src/llvm/llvm_scalarize.cpp|  42 --
 10 files changed, 573 insertions(+), 198 deletions(-)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index a1ae5ea..6bb0f22 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -3501,12 +3501,14 @@ namespace gbe
   }
 
   void GenContext::emitOBReadInstruction(const SelectionInstruction ) {
-const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), 
GEN_TYPE_UD);
+const GenRegister dst= ra->genReg(insn.dst(1));
+uint32_t type = dst.type;
+uint32_t typesize = typeSize(type);
 const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), 
GEN_TYPE_UD);
 const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), 
GEN_TYPE_UD);
 const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4);
 const uint32_t vec_size = insn.extra.elem;
-const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), GEN_TYPE_UD);
+const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + 
vec_size)), type);
 const uint32_t simdWidth = p->curr.execWidth;
 
 // Make header
@@ -3532,7 +3534,7 @@ namespace gbe
   {
 p->curr.execWidth = 16;
 p->curr.noMask = 1;
-p->OBREAD(dst, header, insn.getbti(), simdWidth / 4);
+p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16);
   }
   p->pop();
 } else if (vec_size == 2) {
@@ -3540,14 +3542,41 @@ namespace gbe
   {
 p->curr.execWidth = 16;
 p->curr.noMask = 1;
-p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2);
+p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8);
   }
   p->pop();
   p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0));
-  p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8));
-} else if (vec_size == 4 || vec_size == 8) {
+  p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * 
typesize ));
+} else if (vec_size == 4) {
   if (simdWidth == 8) {
-for (uint32_t i = 0; i < vec_size / 4; i++) {
+p->push();
+{
+  p->curr.execWidth = 16;
+  p->curr.noMask = 1;
+  p->OBREAD(tmp, header, insn.getbti(), 2 * typesize);
+}
+p->pop();
+for (uint32_t j = 0; j < 4; j++)
+  p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * 
simdWidth * typesize ));
+  } else {
+for (uint32_t i = 0; i < typesize / 2; i++) {
+  if (i > 0) {
+p->push();
+{
+  // Update the address in header
+  p->curr.execWidth = 1;
+  p->ADD(headeraddr, headeraddr, GenRegister::immud(128));
+}
+p->pop();
+  }
+  p->OBREAD(tmp, header, insn.getbti(), 8);
+  for (uint32_t j = 0; j < 8 / typesize ; j++)
+p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), 
GenRegister::offset(tmp, 0 ,j * simdWidth * typesize ));
+}
+  }
+} else if (vec_size == 8) {
+  if (simdWidth == 8) {
+for (uint32_t i = 0; i < typesize / 2; i++) {
   if (i > 0) {
 p->push();
 {
@@ -3564,11 +3593,11 @@ namespace gbe
 p->OBREAD(tmp, header, insn.getbti(), 8);
   }
   p->pop();
-  for (uint32_t j = 0; j < 4; j++)
-p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), 
GenRegister::offset(tmp, j));
+  for (uint32_t j = 0; j < 16 / typesize; j++)
+p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), 
GenRegister::offset(tmp, 0, j * simdWidth * typesize ));
 }
   } else {
-for (uint32_t i = 0; i < vec_size / 2; i++) {
+for (uint32_t i = 0; i < typesize ; i++) {
   if (i > 0) {
 p->push();
 {
@@ -3579,8 +3608,8 @@ namespace gbe
 p->pop();
   }
   p->OBREAD(tmp, header, insn.getbti(), 8);
-  for (uint32_t j = 0; j < 2; j++)
-p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), 
GenRegister::offset(tmp, 

[Beignet] [PATCH 08/14] Backend: Change the sel ir optimization for unpack register

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

To unpack UW we may need to add a mov, and we do not want this mov to be
optimized away by the sel ir optimization. Add a check for hstride to avoid
this kind of optimization.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_insn_selection_optimize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp 
b/backend/src/backend/gen_insn_selection_optimize.cpp
index b8aa776..56c7615 100644
--- a/backend/src/backend/gen_insn_selection_optimize.cpp
+++ b/backend/src/backend/gen_insn_selection_optimize.cpp
@@ -161,7 +161,7 @@ namespace gbe
 assert(insn.opcode == SEL_OP_MOV);
 const GenRegister& src = insn.src(0);
 const GenRegister& dst = insn.dst(0);
-if (src.type != dst.type || src.file != dst.file)
+if (src.type != dst.type || src.file != dst.file || src.hstride != 
dst.hstride)
   return;
 
 if (liveout.find(dst.reg()) != liveout.end())
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 09/14] Backend: Add short sub group builtin functions

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Add intel sub group short type builtins.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_context.cpp  | 12 +
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 44 
 backend/src/libocl/tmpl/ocl_simd.tmpl.h  | 36 ++
 3 files changed, 92 insertions(+)

diff --git a/backend/src/backend/gen_context.cpp 
b/backend/src/backend/gen_context.cpp
index e907931..a1ae5ea 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -2878,6 +2878,10 @@ namespace gbe
 p->MOV(dataReg, GenRegister::immint64(0x0));
   else if (dataReg.type == GEN_TYPE_UL)
 p->MOV(dataReg, GenRegister::immuint64(0x0));
+  else if (dataReg.type == GEN_TYPE_W)
+p->MOV(dataReg, GenRegister::immw(0x0));
+  else if (dataReg.type == GEN_TYPE_UW)
+p->MOV(dataReg, GenRegister::immuw(0x0));
   else
 GBE_ASSERT(0); /* unsupported data-type */
 }
@@ -2896,6 +2900,10 @@ namespace gbe
 p->MOV(dataReg, GenRegister::immint64(0x7FFFL));
   else if (dataReg.type == GEN_TYPE_UL)
 p->MOV(dataReg, GenRegister::immuint64(0xL));
+  else if (dataReg.type == GEN_TYPE_W)
+p->MOV(dataReg, GenRegister::immw(0x7FFF));
+  else if (dataReg.type == GEN_TYPE_UW)
+p->MOV(dataReg, GenRegister::immuw(0x));
   else
 GBE_ASSERT(0); /* unsupported data-type */
 }
@@ -2914,6 +2922,10 @@ namespace gbe
 p->MOV(dataReg, GenRegister::immint64(0x8000L));
   else if (dataReg.type == GEN_TYPE_UL)
 p->MOV(dataReg, GenRegister::immuint64(0x0));
+  else if (dataReg.type == GEN_TYPE_W)
+p->MOV(dataReg, GenRegister::immw(0x8000));
+  else if (dataReg.type == GEN_TYPE_UW)
+p->MOV(dataReg, GenRegister::immuw(0x0));
   else
 GBE_ASSERT(0); /* unsupported data-type */
 }
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index d1bcfa3..90c7cc2 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -76,6 +76,8 @@ RANGE_OP(reduce, add, ulong, false)
 RANGE_OP(reduce, add, half, true)
 RANGE_OP(reduce, add, float, true)
 RANGE_OP(reduce, add, double, true)
+RANGE_OP(reduce, add, short, true)
+RANGE_OP(reduce, add, ushort, false)
 /* reduce min */
 RANGE_OP(reduce, min, int, true)
 RANGE_OP(reduce, min, uint, false)
@@ -84,6 +86,8 @@ RANGE_OP(reduce, min, ulong, false)
 RANGE_OP(reduce, min, half, true)
 RANGE_OP(reduce, min, float, true)
 RANGE_OP(reduce, min, double, true)
+RANGE_OP(reduce, min, short, true)
+RANGE_OP(reduce, min, ushort, false)
 /* reduce max */
 RANGE_OP(reduce, max, int, true)
 RANGE_OP(reduce, max, uint, false)
@@ -92,6 +96,8 @@ RANGE_OP(reduce, max, ulong, false)
 RANGE_OP(reduce, max, half, true)
 RANGE_OP(reduce, max, float, true)
 RANGE_OP(reduce, max, double, true)
+RANGE_OP(reduce, max, short, true)
+RANGE_OP(reduce, max, ushort, false)
 
 /* scan_inclusive add */
 RANGE_OP(scan_inclusive, add, int, true)
@@ -101,6 +107,8 @@ RANGE_OP(scan_inclusive, add, ulong, false)
 RANGE_OP(scan_inclusive, add, half, true)
 RANGE_OP(scan_inclusive, add, float, true)
 RANGE_OP(scan_inclusive, add, double, true)
+RANGE_OP(scan_inclusive, add, short, true)
+RANGE_OP(scan_inclusive, add, ushort, false)
 /* scan_inclusive min */
 RANGE_OP(scan_inclusive, min, int, true)
 RANGE_OP(scan_inclusive, min, uint, false)
@@ -109,6 +117,8 @@ RANGE_OP(scan_inclusive, min, ulong, false)
 RANGE_OP(scan_inclusive, min, half, true)
 RANGE_OP(scan_inclusive, min, float, true)
 RANGE_OP(scan_inclusive, min, double, true)
+RANGE_OP(scan_inclusive, min, short, true)
+RANGE_OP(scan_inclusive, min, ushort, false)
 /* scan_inclusive max */
 RANGE_OP(scan_inclusive, max, int, true)
 RANGE_OP(scan_inclusive, max, uint, false)
@@ -117,6 +127,8 @@ RANGE_OP(scan_inclusive, max, ulong, false)
 RANGE_OP(scan_inclusive, max, half, true)
 RANGE_OP(scan_inclusive, max, float, true)
 RANGE_OP(scan_inclusive, max, double, true)
+RANGE_OP(scan_inclusive, max, short, true)
+RANGE_OP(scan_inclusive, max, ushort, false)
 
 /* scan_exclusive add */
 RANGE_OP(scan_exclusive, add, int, true)
@@ -126,6 +138,8 @@ RANGE_OP(scan_exclusive, add, ulong, false)
 RANGE_OP(scan_exclusive, add, half, true)
 RANGE_OP(scan_exclusive, add, float, true)
 RANGE_OP(scan_exclusive, add, double, true)
+RANGE_OP(scan_exclusive, add, short, true)
+RANGE_OP(scan_exclusive, add, ushort, false)
 /* scan_exclusive min */
 RANGE_OP(scan_exclusive, min, int, true)
 RANGE_OP(scan_exclusive, min, uint, false)
@@ -134,6 +148,8 @@ RANGE_OP(scan_exclusive, min, ulong, false)
 RANGE_OP(scan_exclusive, min, half, true)
 RANGE_OP(scan_exclusive, min, float, true)
 RANGE_OP(scan_exclusive, min, double, true)
+RANGE_OP(scan_exclusive, min, short, true)
+RANGE_OP(scan_exclusive, min, ushort, 

[Beignet] [PATCH 01/14] Libocl: Add intel_subgroups_short extension

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

We support intel_subgroups_short extension.

Signed-off-by: Pan Xiuli 
---
 backend/src/libocl/include/ocl.h | 1 +
 src/cl_extensions.h  | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
index 5e3a788..6230b93 100644
--- a/backend/src/libocl/include/ocl.h
+++ b/backend/src/libocl/include/ocl.h
@@ -114,6 +114,7 @@
 #define cl_khr_fp16
 #define cl_khr_3d_image_writes
 #define cl_intel_subgroups
+#define cl_intel_subgroups_short
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : disable
 #pragma OPENCL EXTENSION cl_khr_fp16 : disable
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 1139775..c32e085 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -28,7 +28,8 @@
 #define DECL_INTEL_EXTENSIONS \
   DECL_EXT(intel_accelerator) \
   DECL_EXT(intel_motion_estimation) \
-  DECL_EXT(intel_subgroups)
+  DECL_EXT(intel_subgroups) \
+  DECL_EXT(intel_subgroups_short)
 
 #define DECL_GL_EXTENSIONS \
   DECL_EXT(khr_gl_sharing)\
@@ -63,7 +64,7 @@ cl_khr_extension_id_max
 #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics)
 #define OPT1_EXT_END_ID EXT_ID(khr_icd)
 #define INTEL_EXT_START_ID EXT_ID(intel_accelerator)
-#define INTEL_EXT_END_ID EXT_ID(intel_subgroups)
+#define INTEL_EXT_END_ID EXT_ID(intel_subgroups_short)
 #define GL_EXT_START_ID EXT_ID(khr_gl_sharing)
 #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing)
 
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 12/14] Utest: Add test case for short type sub group shuffle

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Signed-off-by: Pan Xiuli 
---
 kernels/compiler_sub_group_shuffle.cl  | 22 +++-
 kernels/compiler_sub_group_shuffle_down.cl | 23 -
 kernels/compiler_sub_group_shuffle_up.cl   | 23 -
 kernels/compiler_sub_group_shuffle_xor.cl  | 23 -
 utests/compiler_sub_group_shuffle.cpp  | 52 ++--
 utests/compiler_sub_group_shuffle_down.cpp | 54 --
 utests/compiler_sub_group_shuffle_up.cpp   | 54 --
 utests/compiler_sub_group_shuffle_xor.cpp  | 54 --
 8 files changed, 289 insertions(+), 16 deletions(-)

diff --git a/kernels/compiler_sub_group_shuffle.cl 
b/kernels/compiler_sub_group_shuffle.cl
index 322da74..c771eea 100644
--- a/kernels/compiler_sub_group_shuffle.cl
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, 
int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_short(global short *dst, int c)
+{
+  short i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  short from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  short o0 = get_sub_group_local_id();
+  short o1 = intel_sub_group_shuffle(from, c);
+  short o2 = intel_sub_group_shuffle(from, 5);
+  short o3 = intel_sub_group_shuffle(from, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_down.cl 
b/kernels/compiler_sub_group_shuffle_down.cl
index 769fc3f..40bac05 100644
--- a/kernels/compiler_sub_group_shuffle_down.cl
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_down(global int 
*dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c)
+{
+  short i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  short from = i;
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  int k = get_sub_group_local_id() + 1;
+  short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c);
+  short o1 = intel_sub_group_shuffle_down((short)123, from, c);
+  short o2 = intel_sub_group_shuffle_down(from, (short)-from, k);
+  short o3 = intel_sub_group_shuffle_down(from, (short)321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_up.cl 
b/kernels/compiler_sub_group_shuffle_up.cl
index 5c5cee1..fd287d5 100644
--- a/kernels/compiler_sub_group_shuffle_up.cl
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, 
int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c)
+{
+  short i = get_global_id(0);
+  if (i == 0)
+dst[0] = get_max_sub_group_size();
+  dst++;
+
+  short from = i;
+  int j = get_sub_group_local_id() + 1;
+  int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;
+  short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c);
+  short o1 = intel_sub_group_shuffle_up((short)123, from, c);
+  short o2 = intel_sub_group_shuffle_up(from, (short)-from, k);
+  short o3 = intel_sub_group_shuffle_up(from, (short)321, j);
+  dst[i*4] = o0;
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl 
b/kernels/compiler_sub_group_shuffle_xor.cl
index 8bc15d3..df3dfe7 100644
--- a/kernels/compiler_sub_group_shuffle_xor.cl
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int 
*dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT
+__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c)
+{
+  short i = get_global_id(0);
+  if (i == 0)
+dst[0] = 

[Beignet] [PATCH 14/14] Utest: Add subgroup block read/write ushort test case

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Add ushort block read/write test cases for buffers and images.
Rename the existing uint block read/write test kernels to use the _ui suffix.

Signed-off-by: Pan Xiuli 
---
 kernels/compiler_subgroup_buffer_block_read.cl  | 47 ++--
 kernels/compiler_subgroup_buffer_block_write.cl | 44 +--
 kernels/compiler_subgroup_image_block_read.cl   | 49 +++--
 kernels/compiler_subgroup_image_block_write.cl  | 46 +---
 utests/compiler_subgroup_buffer_block_read.cpp  | 73 +++---
 utests/compiler_subgroup_buffer_block_write.cpp | 74 ---
 utests/compiler_subgroup_image_block_read.cpp   | 98 +++--
 utests/compiler_subgroup_image_block_write.cpp  | 73 +++---
 8 files changed, 412 insertions(+), 92 deletions(-)

diff --git a/kernels/compiler_subgroup_buffer_block_read.cl 
b/kernels/compiler_subgroup_buffer_block_read.cl
index 9edaa2e..4cbf894 100644
--- a/kernels/compiler_subgroup_buffer_block_read.cl
+++ b/kernels/compiler_subgroup_buffer_block_read.cl
@@ -1,31 +1,62 @@
-__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global 
uint *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui1(global uint *src, global 
uint *dst)
 {
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size();
-  uint tmp = intel_sub_group_block_read(p);
+  uint tmp = intel_sub_group_block_read_ui(p);
   dst[id] = tmp;
 }
 
-__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global 
uint2 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui2(global uint *src, global 
uint2 *dst)
 {
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
-  uint2 tmp = intel_sub_group_block_read2(p);
+  uint2 tmp = intel_sub_group_block_read_ui2(p);
   dst[id] = tmp;
 }
 
-__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global 
uint4 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui4(global uint *src, global 
uint4 *dst)
 {
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
-  uint4 tmp = intel_sub_group_block_read4(p);
+  uint4 tmp = intel_sub_group_block_read_ui4(p);
   dst[id] = tmp;
 }
 
-__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global 
uint8 *dst)
+__kernel void compiler_subgroup_buffer_block_read_ui8(global uint *src, global 
uint8 *dst)
 {
   int id = get_global_id(0);
   global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
-  uint8 tmp = intel_sub_group_block_read8(p);
+  uint8 tmp = intel_sub_group_block_read_ui8(p);
   dst[id] = tmp;
 }
+#ifdef SHORT
+__kernel void compiler_subgroup_buffer_block_read_us1(global ushort *src, 
global ushort *dst)
+{
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size();
+  ushort tmp = intel_sub_group_block_read_us(p);
+  dst[id] = tmp;
+}
+__kernel void compiler_subgroup_buffer_block_read_us2(global ushort *src, 
global ushort2 *dst)
+{
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*2;
+  ushort2 tmp = intel_sub_group_block_read_us2(p);
+  dst[id] = tmp;
+}
+__kernel void compiler_subgroup_buffer_block_read_us4(global ushort *src, 
global ushort4 *dst)
+{
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*4;
+  ushort4 tmp = intel_sub_group_block_read_us4(p);
+  dst[id] = tmp;
+}
+
+__kernel void compiler_subgroup_buffer_block_read_us8(global ushort *src, 
global ushort8 *dst)
+{
+  int id = get_global_id(0);
+  global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*8;
+  ushort8 tmp = intel_sub_group_block_read_us8(p);
+  dst[id] = tmp;
+}
+#endif
diff --git a/kernels/compiler_subgroup_buffer_block_write.cl 
b/kernels/compiler_subgroup_buffer_block_write.cl
index f735855..f452dcc 100644
--- a/kernels/compiler_subgroup_buffer_block_write.cl
+++ b/kernels/compiler_subgroup_buffer_block_write.cl
@@ -1,27 +1,55 @@
-__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global 
uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui1(global uint *src, 
global uint *dst)
 {
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size();
-  intel_sub_group_block_write(p,src[id]);
+  intel_sub_group_block_write_ui(p,src[id]);
 }
 
-__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global 
uint *dst)
+__kernel void compiler_subgroup_buffer_block_write_ui2(global uint2 *src, 
global uint *dst)
 {
   int id = get_global_id(0);
   global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2;
-  intel_sub_group_block_write2(p,src[id]);
+  intel_sub_group_block_write_ui2(p,src[id]);
 }
 
-__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global 
uint *dst)
+__kernel 

[Beignet] [PATCH 11/14] Backend: Add sub groups short shuffle builtin functions

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Add short and ushort overloads of the sub group shuffle (SIMD shuffle) builtins.

Signed-off-by: Pan Xiuli 
---
 backend/src/ir/instruction.cpp   |  5 +++--
 backend/src/libocl/tmpl/ocl_simd.tmpl.cl |  6 ++
 backend/src/libocl/tmpl/ocl_simd.tmpl.h  | 11 +++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index ed64580..08a94cd 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -1493,8 +1493,9 @@ namespace ir {
 
 INLINE bool SimdShuffleInstruction::wellFormed(const Function , 
std::string ) const
 {
-  if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && 
this->type != TYPE_FLOAT)) {
-whyNot = "Only support S32/U32/FLOAT type";
+  if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && 
this->type != TYPE_FLOAT &&
+this->type != TYPE_U16 && this->type != TYPE_S16)) {
+whyNot = "Only support S16/U16/S32/U32/FLOAT type";
 return false;
   }
 
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
index 90c7cc2..9023107 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl
@@ -283,6 +283,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE 
y, uint c) { \
 SHUFFLE_DOWN(float)
 SHUFFLE_DOWN(int)
 SHUFFLE_DOWN(uint)
+SHUFFLE_DOWN(short)
+SHUFFLE_DOWN(ushort)
 #undef SHUFFLE_DOWN
 
 #define SHUFFLE_UP(TYPE) \
@@ -296,6 +298,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE 
y, uint c) { \
 SHUFFLE_UP(float)
 SHUFFLE_UP(int)
 SHUFFLE_UP(uint)
+SHUFFLE_UP(short)
+SHUFFLE_UP(ushort)
 #undef SHUFFLE_UP
 #define SHUFFLE_XOR(TYPE) \
 OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \
@@ -304,4 +308,6 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint 
c) { \
 SHUFFLE_XOR(float)
 SHUFFLE_XOR(int)
 SHUFFLE_XOR(uint)
+SHUFFLE_XOR(short)
+SHUFFLE_XOR(ushort)
 #undef SHUFFLE_XOR
diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h 
b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
index d0f06d1..158c8e1 100644
--- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h
+++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h
@@ -169,15 +169,26 @@ OVERLOADABLE half intel_sub_group_shuffle(half x, uint c);
 OVERLOADABLE float intel_sub_group_shuffle(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle(ushort x, uint c);
+
 OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_down(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_down(ushort x, ushort y, uint c);
+
 OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_up(short x, short y, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_up(ushort x, ushort y, uint c);
+
 OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c);
 OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c);
 OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c);
+OVERLOADABLE short intel_sub_group_shuffle_xor(short x, uint c);
+OVERLOADABLE ushort intel_sub_group_shuffle_xor(ushort x, uint c);
 
 /* blocak read/write */
 OVERLOADABLE uint intel_sub_group_block_read(const global uint* p);
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 02/14] Backend: Refine GenRegister::offset

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Fold the excess subnr into nr when subnr is 32 or larger, so that
offset() can be used universally.

Signed-off-by: Pan Xiuli 
---
 backend/src/backend/gen_register.hpp | 4 
 1 file changed, 4 insertions(+)

diff --git a/backend/src/backend/gen_register.hpp 
b/backend/src/backend/gen_register.hpp
index bbea761..bda35e4 100644
--- a/backend/src/backend/gen_register.hpp
+++ b/backend/src/backend/gen_register.hpp
@@ -268,6 +268,10 @@ namespace gbe
 
 static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) {
   GenRegister r = reg;
+  if(subnr >= 32){
+nr += subnr / 32;
+subnr = subnr % 32;
+  }
   r.nr += nr;
   r.subnr += subnr;
   r.subphysical = 1;
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet


[Beignet] [PATCH 07/14] Utest: Add test case for sub group broadcast short

2016-10-12 Thread Xiuli Pan
From: Pan Xiuli 

Signed-off-by: Pan Xiuli 
---
 kernels/compiler_subgroup_broadcast.cl | 10 ++
 utests/compiler_subgroup_broadcast.cpp | 11 +++
 2 files changed, 21 insertions(+)

diff --git a/kernels/compiler_subgroup_broadcast.cl 
b/kernels/compiler_subgroup_broadcast.cl
index 63e9568..3d16d67 100644
--- a/kernels/compiler_subgroup_broadcast.cl
+++ b/kernels/compiler_subgroup_broadcast.cl
@@ -32,6 +32,16 @@ kernel void compiler_subgroup_broadcast_long(global long 
*src,
   long broadcast_val = sub_group_broadcast(val, simd_id);
   dst[index] = broadcast_val;
 }
+kernel void compiler_subgroup_broadcast_short(global short *src,
+global short *dst,
+uint simd_id)
+{
+  uint index = get_global_id(0);
+
+  short val = src[index];
+  short broadcast_val = sub_group_broadcast(val, simd_id);
+  dst[index] = broadcast_val;
+}
 #else
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 kernel void compiler_subgroup_broadcast_half(global half *src,
diff --git a/utests/compiler_subgroup_broadcast.cpp 
b/utests/compiler_subgroup_broadcast.cpp
index 5aa749c..33ec43c 100644
--- a/utests/compiler_subgroup_broadcast.cpp
+++ b/utests/compiler_subgroup_broadcast.cpp
@@ -190,6 +190,17 @@ void compiler_subgroup_broadcast_long(void)
   subgroup_generic(input, expected);
 }
 MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long);
+void compiler_subgroup_broadcast_short(void)
+{
+  if(!cl_check_subgroups_short())
+return;
+  cl_short *input = NULL;
+  cl_short *expected = NULL;
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast",
+  "compiler_subgroup_broadcast_short");
+  subgroup_generic(input, expected);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_short);
 void compiler_subgroup_broadcast_half(void)
 {
   if(!cl_check_subgroups())
-- 
2.7.4

___
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet