From: Pan Xiuli <xiuli....@intel.com>

Signed-off-by: Pan Xiuli <xiuli....@intel.com>
---
 kernels/compiler_sub_group_shuffle.cl      | 22 +++++++++++-
 kernels/compiler_sub_group_shuffle_down.cl | 23 ++++++++++++-
 kernels/compiler_sub_group_shuffle_up.cl   | 23 ++++++++++++-
 kernels/compiler_sub_group_shuffle_xor.cl  | 23 ++++++++++++-
 utests/compiler_sub_group_shuffle.cpp      | 52 ++++++++++++++++++++++++++--
 utests/compiler_sub_group_shuffle_down.cpp | 54 ++++++++++++++++++++++++++++--
 utests/compiler_sub_group_shuffle_up.cpp   | 54 ++++++++++++++++++++++++++++--
 utests/compiler_sub_group_shuffle_xor.cpp  | 54 ++++++++++++++++++++++++++++--
 8 files changed, 289 insertions(+), 16 deletions(-)

diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl
index 322da74..c771eea 100644
--- a/kernels/compiler_sub_group_shuffle.cl
+++ b/kernels/compiler_sub_group_shuffle.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT  // built only when SHORT is defined; the host test compiles this file with -DSHORT
+__kernel void compiler_sub_group_shuffle_short(global short *dst, int c)  /* short-typed intel_sub_group_shuffle test; c is the host-chosen shuffle index */
+{
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();  // slot 0 reports the sub-group size back to the host
+  dst++;  // per-work-item results start after the size slot
+
+  short from = i;  // every work-item contributes its own global id
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;  // mirrored index: (size - 1) - local id
+  short o0 = get_sub_group_local_id();
+  short o1 = intel_sub_group_shuffle(from, c);  // host expects the id at index c of this work-item's sub-group
+  short o2 = intel_sub_group_shuffle(from, 5);  // same, with the fixed index 5
+  short o3 = intel_sub_group_shuffle(from, j);  // same, with the mirrored index j
+  dst[i*4] = o0;  // four consecutive shorts are written per work-item
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif  // SHORT
diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl
index 769fc3f..40bac05 100644
--- a/kernels/compiler_sub_group_shuffle_down.cl
+++ b/kernels/compiler_sub_group_shuffle_down.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_down(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT  // built only when SHORT is defined; the host test compiles this file with -DSHORT
+__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c)  /* short-typed intel_sub_group_shuffle_down test; c is the host-chosen delta for o0/o1 */
+{
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();  // slot 0 reports the sub-group size back to the host
+  dst++;  // per-work-item results start after the size slot
+
+  short from = i;  // every work-item contributes its own global id
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;  // (size - 1) - local id
+  int k = get_sub_group_local_id() + 1;  // local id + 1
+  short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c);  // constant operands: host expects 456 when local id + c reaches the sub-group size, else 123
+  short o1 = intel_sub_group_shuffle_down((short)123, from, c);  // host expects 123, or a global id taken from the second operand
+  short o2 = intel_sub_group_shuffle_down(from, (short)-from, k);  // a negative result marks a value taken from the second operand
+  short o3 = intel_sub_group_shuffle_down(from, (short)321, j);  // host expects the global id of the sub-group's last work-item
+  dst[i*4] = o0;  // four consecutive shorts are written per work-item
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif  // SHORT
diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl
index 5c5cee1..fd287d5 100644
--- a/kernels/compiler_sub_group_shuffle_up.cl
+++ b/kernels/compiler_sub_group_shuffle_up.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT  // built only when SHORT is defined; the host test compiles this file with -DSHORT
+__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c)  /* short-typed intel_sub_group_shuffle_up test; c is the host-chosen delta for o0/o1 */
+{
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();  // slot 0 reports the sub-group size back to the host
+  dst++;  // per-work-item results start after the size slot
+
+  short from = i;  // every work-item contributes its own global id
+  int j = get_sub_group_local_id() + 1;  // local id + 1
+  int k = get_max_sub_group_size() - get_sub_group_local_id() - 1;  // (size - 1) - local id
+  short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c);  // constant operands: host expects 123 while local id < c, else 456
+  short o1 = intel_sub_group_shuffle_up((short)123, from, c);  // host expects 123 while local id < c, else global id - c
+  short o2 = intel_sub_group_shuffle_up(from, (short)-from, k);  // a negative result marks a value taken from the second operand
+  short o3 = intel_sub_group_shuffle_up(from, (short)321, j);  // host expects the global id of the sub-group's last work-item
+  dst[i*4] = o0;  // four consecutive shorts are written per work-item
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif  // SHORT
diff --git a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl
index 8bc15d3..df3dfe7 100644
--- a/kernels/compiler_sub_group_shuffle_xor.cl
+++ b/kernels/compiler_sub_group_shuffle_xor.cl
@@ -1,4 +1,4 @@
-__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
+__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c)
 {
   int i = get_global_id(0);
   if (i == 0)
@@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int *dst, int c)
   dst[i*4+2] = o2;
   dst[i*4+3] = o3;
 }
+#ifdef SHORT  // built only when SHORT is defined; the host test compiles this file with -DSHORT
+__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c)  /* short-typed intel_sub_group_shuffle_xor test; c is the host-chosen XOR mask for o1 */
+{
+  short i = get_global_id(0);
+  if (i == 0)
+    dst[0] = get_max_sub_group_size();  // slot 0 reports the sub-group size back to the host
+  dst++;  // per-work-item results start after the size slot
+
+  short from = i;  // every work-item contributes its own global id
+  int j = get_max_sub_group_size() - get_sub_group_local_id() - 1;  // (size - 1) - local id
+  int k = get_sub_group_local_id() + 1;  // local id + 1
+  short o0 = get_sub_group_local_id();
+  short o1 = intel_sub_group_shuffle_xor(from, c);  // host expects the id at index (local id ^ c) of this sub-group
+  short o2 = intel_sub_group_shuffle_xor(from, j);  // same, with the per-work-item mask j
+  short o3 = intel_sub_group_shuffle_xor(from, k);  // same, with the per-work-item mask k
+  dst[i*4] = o0;  // four consecutive shorts are written per work-item
+  dst[i*4+1] = o1;
+  dst[i*4+2] = o2;
+  dst[i*4+3] = o3;
+}
+#endif  // SHORT
diff --git a/utests/compiler_sub_group_shuffle.cpp b/utests/compiler_sub_group_shuffle.cpp
index f33e9de..2aadfed 100644
--- a/utests/compiler_sub_group_shuffle.cpp
+++ b/utests/compiler_sub_group_shuffle.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void compiler_sub_group_shuffle(void)
+void compiler_sub_group_shuffle_int(void)
 {
   if(!cl_check_subgroups())
     return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle(void)
   const int32_t buf_size = 4 * n + 1;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle",
+                              "compiler_sub_group_shuffle_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
 
@@ -43,5 +44,50 @@ void compiler_sub_group_shuffle(void)
   }
   OCL_UNMAP_BUFFER(0);
 }
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_int);
+void compiler_sub_group_shuffle_short(void)  // host check of intel_sub_group_shuffle on short data
+{
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;  // four results per work-item plus one leading slot for the sub-group size
+
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle.cl",
+                           "compiler_sub_group_shuffle_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 3;  // shuffle index handed to the kernel
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;  // pre-fill with -1 so slots the kernel never wrote fail the checks
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
 
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle);
+  // Compare
+  OCL_MAP_BUFFER(0);
+  short* dst = (short*)buf_data[0];
+  int suggroupsize = dst[0];  // the kernel stores the sub-group size in slot 0
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;  // step past the size slot to the per-work-item results
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;  // which sub-group work-item i belongs to
+    int index = i % suggroupsize;  // its position inside that sub-group
+    OCL_ASSERT(index == dst[4*i]);
+    OCL_ASSERT((round * suggroupsize + c) == dst[4*i+1]);
+    OCL_ASSERT((round * suggroupsize + 5) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - index - 1)) == dst[4*i+3]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_short);
diff --git a/utests/compiler_sub_group_shuffle_down.cpp b/utests/compiler_sub_group_shuffle_down.cpp
index 8b23234..13f6e12 100644
--- a/utests/compiler_sub_group_shuffle_down.cpp
+++ b/utests/compiler_sub_group_shuffle_down.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void compiler_sub_group_shuffle_down(void)
+void compiler_sub_group_shuffle_down_int(void)
 {
   if(!cl_check_subgroups())
     return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_down(void)
   const int32_t buf_size = 4 * n + 1;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_down");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_down",
+                              "compiler_sub_group_shuffle_down_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
 
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_down(void)
   }
   OCL_UNMAP_BUFFER(0);
 }
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_int);
 
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down);
+void compiler_sub_group_shuffle_down_short(void)  // host check of intel_sub_group_shuffle_down on short data
+{
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;  // four results per work-item plus one leading slot for the sub-group size
+
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_down.cl",
+                           "compiler_sub_group_shuffle_down_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 13;  // shuffle delta handed to the kernel
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;  // pre-fill with -1 so slots the kernel never wrote fail the checks
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];  // the kernel stores the sub-group size in slot 0
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;  // step past the size slot to the per-work-item results
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;  // which sub-group work-item i belongs to
+    int index = i % suggroupsize;  // its position inside that sub-group
+    // When index + delta reaches the sub-group size the value comes from the kernel call's second operand, otherwise from the first.
+    OCL_ASSERT( (index + c >= suggroupsize ? 456 : 123) == dst[4*i]);
+    OCL_ASSERT( (index + c >= suggroupsize ? (round * suggroupsize + (i + c) % suggroupsize): 123) == dst[4*i+1]);
+    OCL_ASSERT( (index + index + 1 >= suggroupsize ? -(round * suggroupsize + (i + index + 1) % suggroupsize) : (round * suggroupsize + (i + index + 1) % suggroupsize))  == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_down_short);
diff --git a/utests/compiler_sub_group_shuffle_up.cpp b/utests/compiler_sub_group_shuffle_up.cpp
index d2e054b..f79f03c 100644
--- a/utests/compiler_sub_group_shuffle_up.cpp
+++ b/utests/compiler_sub_group_shuffle_up.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void compiler_sub_group_shuffle_up(void)
+void compiler_sub_group_shuffle_up_int(void)
 {
   if(!cl_check_subgroups())
     return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_up(void)
   const int32_t buf_size = 4 * n + 1;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_up");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_up",
+                              "compiler_sub_group_shuffle_up_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
 
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_up(void)
   }
   OCL_UNMAP_BUFFER(0);
 }
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_int);
 
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up);
+void compiler_sub_group_shuffle_up_short(void)  // host check of intel_sub_group_shuffle_up on short data
+{
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;  // four results per work-item plus one leading slot for the sub-group size
+
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_up.cl",
+                           "compiler_sub_group_shuffle_up_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 3;  // shuffle delta handed to the kernel
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;  // pre-fill with -1 so slots the kernel never wrote fail the checks
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];  // the kernel stores the sub-group size in slot 0
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;  // step past the size slot to the per-work-item results
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;  // which sub-group work-item i belongs to
+    int index = i % suggroupsize;  // its position inside that sub-group
+    // While index is below the delta the value comes from the kernel call's first operand, otherwise from the second.
+    OCL_ASSERT( ((c - index) > 0 ? 123 : 456) == dst[4*i]);
+    OCL_ASSERT( ((c - index) > 0 ? 123 : (i - c)) == dst[4*i+1]);
+    OCL_ASSERT( ((suggroupsize - index - 1 - index) > 0 ? (i + index + 1) : -(i + index + 1 - suggroupsize)) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (suggroupsize - 1)) == dst[4*i+3]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_up_short);
diff --git a/utests/compiler_sub_group_shuffle_xor.cpp b/utests/compiler_sub_group_shuffle_xor.cpp
index 967ec3e..b0ad3ee 100644
--- a/utests/compiler_sub_group_shuffle_xor.cpp
+++ b/utests/compiler_sub_group_shuffle_xor.cpp
@@ -1,6 +1,6 @@
 #include "utest_helper.hpp"
 
-void compiler_sub_group_shuffle_xor(void)
+void compiler_sub_group_shuffle_xor_int(void)
 {
   if(!cl_check_subgroups())
     return;
@@ -8,7 +8,8 @@ void compiler_sub_group_shuffle_xor(void)
   const int32_t buf_size = 4 * n + 1;
 
   // Setup kernel and buffers
-  OCL_CREATE_KERNEL("compiler_sub_group_shuffle_xor");
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_sub_group_shuffle_xor",
+                              "compiler_sub_group_shuffle_xor_int");
   OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(int), NULL);
   OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
 
@@ -44,5 +45,52 @@ void compiler_sub_group_shuffle_xor(void)
   }
   OCL_UNMAP_BUFFER(0);
 }
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_int);
 
-MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor);
+void compiler_sub_group_shuffle_xor_short(void)  // host check of intel_sub_group_shuffle_xor on short data
+{
+  if(!cl_check_subgroups_short())
+    return;
+  const size_t n = 32;
+  const int32_t buf_size = 4 * n + 1;  // four results per work-item plus one leading slot for the sub-group size
+
+  // Setup kernel and buffers
+  OCL_CALL(cl_kernel_init, "compiler_sub_group_shuffle_xor.cl",
+                           "compiler_sub_group_shuffle_xor_short",
+                           SOURCE, "-DSHORT");
+  OCL_CREATE_BUFFER(buf[0], 0, buf_size * sizeof(short), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+
+  int c = 3;  // XOR mask handed to the kernel
+  OCL_SET_ARG(1, sizeof(int), &c);
+
+  globals[0] = n;
+  locals[0] = 16;
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < buf_size; ++i)
+    ((short*)buf_data[0])[i] = -1;  // pre-fill with -1 so slots the kernel never wrote fail the checks
+  OCL_UNMAP_BUFFER(0);
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  short* dst = (short *)buf_data[0];
+  short suggroupsize = dst[0];  // the kernel stores the sub-group size in slot 0
+  OCL_ASSERT(suggroupsize == 8 || suggroupsize == 16);
+
+  dst++;  // step past the size slot to the per-work-item results
+  for (int32_t i = 0; i < (int32_t) n; ++i){
+    int round = i / suggroupsize;  // which sub-group work-item i belongs to
+    int index = i % suggroupsize;  // its position inside that sub-group
+    OCL_ASSERT(index == dst[4*i]);
+    // Each remaining slot should hold the global id found at position (index ^ mask) of the same sub-group.
+    OCL_ASSERT((round * suggroupsize + (c ^ index)) == dst[4*i+1]);
+    OCL_ASSERT((round * suggroupsize + (index ^ (suggroupsize - index -1))) == dst[4*i+2]);
+    OCL_ASSERT((round * suggroupsize + (index ^ (index + 1) % suggroupsize)) == dst[4*i+3]);
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+MAKE_UTEST_FROM_FUNCTION(compiler_sub_group_shuffle_xor_short);
-- 
2.7.4

_______________________________________________
Beignet mailing list
Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet

Reply via email to