Re: [Beignet] [PATCH] Fix build failure with CMRT enabled
LGTM, thanks. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Rebecca N. Palmer Sent: Thursday, October 13, 2016 6:15 AM To: beignet@lists.freedesktop.org Subject: [Beignet] [PATCH] Fix build failure with CMRT enabled 2baff9c moved mem->magic to cl_base_object. --- (Or should this be CL_OBJECT_IS_MEM(mem), i.e. also checking the reference count?) --- a/src/cl_cmrt.cpp +++ b/src/cl_cmrt.cpp @@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k, result = cmrt_kernel->SetKernelArg(index, sz, value); else { cl_mem mem = *(cl_mem*)value; -if (mem->magic == CL_MAGIC_MEM_HEADER) { +if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) { if (!CreateCmrtMemory(mem)) return CL_INVALID_ARG_VALUE; ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH] Fix build failure with CMRT enabled
2baff9c moved mem->magic to cl_base_object. --- (Or should this be CL_OBJECT_IS_MEM(mem), i.e. also checking the reference count?) --- a/src/cl_cmrt.cpp +++ b/src/cl_cmrt.cpp @@ -256,7 +256,7 @@ cl_int cmrt_set_kernel_arg(cl_kernel k, result = cmrt_kernel->SetKernelArg(index, sz, value); else { cl_mem mem = *(cl_mem*)value; -if (mem->magic == CL_MAGIC_MEM_HEADER) { +if (((cl_base_object)mem)->magic == CL_MAGIC_MEM_HEADER) { if (!CreateCmrtMemory(mem)) return CL_INVALID_ARG_VALUE; ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
Re: [Beignet] [PATCH v3] Utests: Allow testing cl_intel_accelerator via ICD
Hi Rebecca, This version LGTM except for a few points that need minor refinement. Just see my comments below. -Original Message- From: Beignet [mailto:beignet-boun...@lists.freedesktop.org] On Behalf Of Rebecca N. Palmer Sent: Wednesday, October 12, 2016 5:50 AM To: Weng, Chuanbo; beignet@lists.freedesktop.org Subject: Re: [Beignet] [PATCH v3] Utests: Allow testing cl_intel_accelerator via ICD v3: Use extension check, not beignet check. Treat claiming to have the extension but not having the kernel as a failure. --- (v2 was the un-numbered 10/10/16 08:07 version...which I subsequently noticed was broken in that it assumed a non-NULL clGetExtensionFunctionAddressForPlatform result meant the extension was supported, which it doesn't in general, https://www.khronos.org/registry/cl/sdk/2.1/docs/man/xhtml/clGetExtensionFunctionAddressForPlatform.html ) --- a/utests/builtin_kernel_block_motion_estimate_intel.cpp +++ b/utests/builtin_kernel_block_motion_estimate_intel.cpp @@ -8,6 +8,19 @@ OCLRELEASEACCELERATORINTEL * oclReleaseA void builtin_kernel_block_motion_estimate_intel(void) { + std::string extStr; + size_t param_value_size; + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, + &param_value_size); std::vector<char> param_value(param_value_size); + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size, + param_value.empty() ? NULL : &param_value.front(), + &param_value_size); if (!param_value.empty()) +extStr = std::string(&param_value.front(), param_value_size-1); // + cl_intel_motion_estimation depends on cl_intel_accelerator, so we only + need to check one if (strstr(extStr.c_str(), "cl_intel_motion_estimation") == NULL) { +printf("No cl_intel_motion_estimation, Skip!"); +return; + } [Chuanbo] It would be better if you wrapped this part of the code into cl_check_motion_estimation() and then moved it to utest_helper.cpp. This will keep the existing code organization style.
There is a bug in Beignet: cl_intel_motion_estimation is supported by IVB only, but all devices show the string cl_intel_motion_estimation in their CL_DEVICE_EXTENSIONS. I'll work out a patch to fix this problem. char* built_in_kernel_names; size_t built_in_kernels_size; cl_int err = CL_SUCCESS; @@ -21,7 +34,8 @@ void builtin_kernel_block_motion_estimat if (strstr(built_in_kernel_names, "block_motion_estimate_intel") == NULL) { free(built_in_kernel_names); -return; +printf("Can't find block_motion_estimate_intel built-in kernel"); [Chuanbo] Although I know some other places use printf instead of fprintf(stderr, ...), let's keep in mind that we had better use fprintf(stderr, ...) for error output. +OCL_ASSERT(0); } cl_program built_in_prog = clCreateProgramWithBuiltInKernels(ctx, 1, &device, built_in_kernel_names, &err); --- a/utests/CMakeLists.txt +++ b/utests/CMakeLists.txt @@ -287,7 +287,8 @@ set (utests_sources multi_queue_events.cpp compiler_mix.cpp compiler_math_3op.cpp - compiler_bsort.cpp) + compiler_bsort.cpp + builtin_kernel_block_motion_estimate_intel.cpp) if (LLVM_VERSION_NODOT VERSION_GREATER 34) SET(utests_sources @@ -328,7 +329,6 @@ else(GEN_PCI_ID) endif(GEN_PCI_ID) if (NOT_BUILD_STAND_ALONE_UTEST) - SET(utests_sources ${utests_sources} builtin_kernel_block_motion_estimate_intel.cpp) ADD_CUSTOM_TARGET(kernel_bin.bin DEPENDS ${kernel_bin}.bin) endif (NOT_BUILD_STAND_ALONE_UTEST) ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 10/14] Utest: Add test case for sub group short builtin functions
From: Pan XiuliSigned-off-by: Pan Xiuli --- kernels/compiler_subgroup_reduce.cl | 22 ++ kernels/compiler_subgroup_scan_exclusive.cl | 36 kernels/compiler_subgroup_scan_inclusive.cl | 36 utests/compiler_subgroup_reduce.cpp | 66 + utests/compiler_subgroup_scan_exclusive.cpp | 66 + utests/compiler_subgroup_scan_inclusive.cpp | 66 + 6 files changed, 292 insertions(+) diff --git a/kernels/compiler_subgroup_reduce.cl b/kernels/compiler_subgroup_reduce.cl index 6d7ecfd..79d8e7d 100644 --- a/kernels/compiler_subgroup_reduce.cl +++ b/kernels/compiler_subgroup_reduce.cl @@ -73,6 +73,17 @@ kernel void compiler_subgroup_reduce_add_float(global float *src, global float * /* * Subgroup reduce max functions */ +kernel void compiler_subgroup_reduce_max_short(global short *src, global short *dst) { + short val = src[get_global_id(0)]; + short sum = sub_group_reduce_max(val); + dst[get_global_id(0)] = sum; +} +kernel void compiler_subgroup_reduce_max_ushort(global ushort *src, global ushort *dst) { + ushort val = src[get_global_id(0)]; + //printf("src is %d\n",val); + ushort sum = sub_group_reduce_max(val); + dst[get_global_id(0)] = sum; +} kernel void compiler_subgroup_reduce_max_int(global int *src, global int *dst) { int val = src[get_global_id(0)]; int sum = sub_group_reduce_max(val); @@ -106,6 +117,17 @@ kernel void compiler_subgroup_reduce_max_float(global float *src, global float * /* * Subgroup reduce min functions */ +kernel void compiler_subgroup_reduce_min_short(global short *src, global short *dst) { + short val = src[get_global_id(0)]; + short sum = sub_group_reduce_min(val); + dst[get_global_id(0)] = sum; +} +kernel void compiler_subgroup_reduce_min_ushort(global ushort *src, global ushort *dst) { + ushort val = src[get_global_id(0)]; + //printf("src is %d\n",val); + ushort sum = sub_group_reduce_min(val); + dst[get_global_id(0)] = sum; +} kernel void compiler_subgroup_reduce_min_int(global int *src, global int *dst) { int val = src[get_global_id(0)]; int sum = 
sub_group_reduce_min(val); diff --git a/kernels/compiler_subgroup_scan_exclusive.cl b/kernels/compiler_subgroup_scan_exclusive.cl index ca0ada2..2c4b928 100644 --- a/kernels/compiler_subgroup_scan_exclusive.cl +++ b/kernels/compiler_subgroup_scan_exclusive.cl @@ -2,6 +2,18 @@ * Subgroup scan exclusive add functions */ #ifndef HALF +kernel void compiler_subgroup_scan_exclusive_add_short(global short *src, global short *dst) { + short val = src[get_global_id(0)]; + short sum = sub_group_scan_exclusive_add(val); + dst[get_global_id(0)] = sum; +} + +kernel void compiler_subgroup_scan_exclusive_add_ushort(global ushort *src, global ushort *dst) { + ushort val = src[get_global_id(0)]; + ushort sum = sub_group_scan_exclusive_add(val); + dst[get_global_id(0)] = sum; +} + kernel void compiler_subgroup_scan_exclusive_add_int(global int *src, global int *dst) { int val = src[get_global_id(0)]; int sum = sub_group_scan_exclusive_add(val); @@ -35,6 +47,18 @@ kernel void compiler_subgroup_scan_exclusive_add_float(global float *src, global /* * Subgroup scan exclusive max functions */ +kernel void compiler_subgroup_scan_exclusive_max_short(global short *src, global short *dst) { + short val = src[get_global_id(0)]; + short sum = sub_group_scan_exclusive_max(val); + dst[get_global_id(0)] = sum; +} + +kernel void compiler_subgroup_scan_exclusive_max_ushort(global ushort *src, global ushort *dst) { + ushort val = src[get_global_id(0)]; + ushort sum = sub_group_scan_exclusive_max(val); + dst[get_global_id(0)] = sum; +} + kernel void compiler_subgroup_scan_exclusive_max_int(global int *src, global int *dst) { int val = src[get_global_id(0)]; int sum = sub_group_scan_exclusive_max(val); @@ -68,6 +92,18 @@ kernel void compiler_subgroup_scan_exclusive_max_float(global float *src, global /* * Subgroup scan exclusive min functions */ +kernel void compiler_subgroup_scan_exclusive_min_short(global short *src, global short *dst) { + short val = src[get_global_id(0)]; + short sum = 
sub_group_scan_exclusive_min(val); + dst[get_global_id(0)] = sum; +} + +kernel void compiler_subgroup_scan_exclusive_min_ushort(global ushort *src, global ushort *dst) { + ushort val = src[get_global_id(0)]; + ushort sum = sub_group_scan_exclusive_min(val); + dst[get_global_id(0)] = sum; +} + kernel void compiler_subgroup_scan_exclusive_min_int(global int *src, global int *dst) { int val = src[get_global_id(0)]; int sum = sub_group_scan_exclusive_min(val); diff --git a/kernels/compiler_subgroup_scan_inclusive.cl b/kernels/compiler_subgroup_scan_inclusive.cl index e97521c..def941c 100644 --- a/kernels/compiler_subgroup_scan_inclusive.cl +++ b/kernels/compiler_subgroup_scan_inclusive.cl @@ -2,6 +2,18
[Beignet] [PATCH 06/14] Utest: Add check subgroup short helper function
From: Pan Xiuli Check if the device supports the intel_subgroups_short extension; also check if the device supports the intel_subgroups extension first. Signed-off-by: Pan Xiuli --- utests/utest_helper.cpp | 20 utests/utest_helper.hpp | 2 ++ 2 files changed, 22 insertions(+) diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp index d3fc069..f1f5af4 100644 --- a/utests/utest_helper.cpp +++ b/utests/utest_helper.cpp @@ -899,6 +899,26 @@ int cl_check_subgroups(void) return 1; } +int cl_check_subgroups_short(void) +{ + if (!cl_check_subgroups()) +return 0; + std::string extStr; + size_t param_value_size; + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, 0, 0, &param_value_size); + std::vector<char> param_value(param_value_size); + OCL_CALL(clGetDeviceInfo, device, CL_DEVICE_EXTENSIONS, param_value_size, + param_value.empty() ? NULL : &param_value.front(), &param_value_size); + if (!param_value.empty()) +extStr = std::string(&param_value.front(), param_value_size-1); + + if (std::strstr(extStr.c_str(), "cl_intel_subgroups_short") == NULL) { +printf("No cl_intel_subgroups_short, Skip!"); +return 0; + } + return 1; +} + int cl_check_ocl20(void) { size_t param_value_size; diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp index 034a411..0f4a1ef 100644 --- a/utests/utest_helper.hpp +++ b/utests/utest_helper.hpp @@ -318,4 +318,6 @@ extern uint32_t __half_to_float(uint16_t h, bool* isInf = NULL, bool* infSign = extern uint16_t __float_to_half(uint32_t x); extern float as_float(uint32_t i); extern uint32_t as_uint(float f); +/* Check is intel subgroups short enabled. */ +extern int cl_check_subgroups_short(void); #endif /* __UTEST_HELPER_HPP__ */ -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 03/14] Backend: Refine register offset for simd shuffle
From: Pan Xiuli Simd shuffle should support different types; we used to support float or dword types. Now we can set the offset by src type. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index 4f73237..e907931 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -719,7 +719,7 @@ namespace gbe p->curr.quarterControl = 1; p->ADD(a0, GenRegister::unpacked_uw(src1.nr+1, src1.subnr / typeSize(GEN_TYPE_UW)), baseReg); -p->MOV(GenRegister::offset(dst, 1, 0), indirect); +p->MOV(GenRegister::offset(dst, 0, 8 * typeSize(src0.type)), indirect); } else NOT_IMPLEMENTED; p->pop(); -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 05/14] Libocl: Add sub group broadcast short builtin function
From: Pan Xiuli Add sub group broadcast and intel sub group broadcast for short type. Signed-off-by: Pan Xiuli --- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 9 + backend/src/libocl/tmpl/ocl_simd.tmpl.h | 4 2 files changed, 13 insertions(+) diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 245ce8a..d1bcfa3 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -50,8 +50,17 @@ BROADCAST_IMPL(ulong) BROADCAST_IMPL(half) BROADCAST_IMPL(float) BROADCAST_IMPL(double) +BROADCAST_IMPL(short) +BROADCAST_IMPL(ushort) #undef BROADCAST_IMPL +OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id) { + return __gen_ocl_sub_group_broadcast(a, local_id); +} + +OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id) { + return __gen_ocl_sub_group_broadcast(a, local_id); +} #define RANGE_OP(RANGE, OP, GEN_TYPE, SIGN) \ OVERLOADABLE GEN_TYPE __gen_ocl_sub_group_##RANGE##_##OP(bool sign, GEN_TYPE x); \ diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h index e8dc6f4..c609c2e 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -42,7 +42,11 @@ OVERLOADABLE ulong sub_group_broadcast(ulong a, uint local_id); OVERLOADABLE half sub_group_broadcast(half a, uint local_id); OVERLOADABLE float sub_group_broadcast(float a, uint local_id); OVERLOADABLE double sub_group_broadcast(double a, uint local_id); +OVERLOADABLE short sub_group_broadcast(short a,uint local_id); +OVERLOADABLE ushort sub_group_broadcast(ushort a, uint local_id); +OVERLOADABLE short intel_sub_group_broadcast(short a, uint local_id); +OVERLOADABLE ushort intel_sub_group_broadcast(ushort a, uint local_id); /* reduce add */ OVERLOADABLE int sub_group_reduce_add(int x); OVERLOADABLE uint sub_group_reduce_add(uint x); -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 13/14] Backend: Add subgroup short block read/write
From: Pan Xiuli Add intel subgroup short mem block read/write and image block read/write; also fix some old block read/write bugs. Refine the old uint block read/write with the _ui suffix. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp| 190 + backend/src/backend/gen_encoder.cpp| 26 +++- backend/src/backend/gen_insn_selection.cpp | 37 +++-- backend/src/ir/instruction.cpp | 26 ++-- backend/src/ir/instruction.hpp | 6 +- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 221 - backend/src/libocl/tmpl/ocl_simd.tmpl.h| 48 ++- backend/src/llvm/llvm_gen_backend.cpp | 125 +++- backend/src/llvm/llvm_gen_ocl_function.hxx | 50 --- backend/src/llvm/llvm_scalarize.cpp| 42 -- 10 files changed, 573 insertions(+), 198 deletions(-) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index a1ae5ea..6bb0f22 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -3501,12 +3501,14 @@ namespace gbe } void GenContext::emitOBReadInstruction(const SelectionInstruction &insn) { -const GenRegister dst= GenRegister::retype(ra->genReg(insn.dst(1)), GEN_TYPE_UD); +const GenRegister dst= ra->genReg(insn.dst(1)); +uint32_t type = dst.type; +uint32_t typesize = typeSize(type); const GenRegister addr = GenRegister::toUniform(ra->genReg(insn.src(0)), GEN_TYPE_UD); const GenRegister header = GenRegister::retype(ra->genReg(insn.dst(0)), GEN_TYPE_UD); const GenRegister headeraddr = GenRegister::offset(header, 0, 2*4); const uint32_t vec_size = insn.extra.elem; -const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), GEN_TYPE_UD); +const GenRegister tmp = GenRegister::retype(ra->genReg(insn.dst(1 + vec_size)), type); const uint32_t simdWidth = p->curr.execWidth; // Make header @@ -3532,7 +3534,7 @@ namespace gbe { p->curr.execWidth = 16; p->curr.noMask = 1; -p->OBREAD(dst, header, insn.getbti(), simdWidth / 4); +p->OBREAD(dst, header, insn.getbti(), simdWidth * typesize / 16); } p->pop(); } else if (vec_size == 2) 
{ @@ -3540,14 +3542,41 @@ namespace gbe { p->curr.execWidth = 16; p->curr.noMask = 1; -p->OBREAD(tmp, header, insn.getbti(), simdWidth / 2); +p->OBREAD(tmp, header, insn.getbti(), simdWidth * typesize / 8); } p->pop(); p->MOV(ra->genReg(insn.dst(1)), GenRegister::offset(tmp, 0)); - p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, simdWidth / 8)); -} else if (vec_size == 4 || vec_size == 8) { + p->MOV(ra->genReg(insn.dst(2)), GenRegister::offset(tmp, 0, simdWidth * typesize )); +} else if (vec_size == 4) { if (simdWidth == 8) { -for (uint32_t i = 0; i < vec_size / 4; i++) { +p->push(); +{ + p->curr.execWidth = 16; + p->curr.noMask = 1; + p->OBREAD(tmp, header, insn.getbti(), 2 * typesize); +} +p->pop(); +for (uint32_t j = 0; j < 4; j++) + p->MOV(ra->genReg(insn.dst(1 + j)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); + } else { +for (uint32_t i = 0; i < typesize / 2; i++) { + if (i > 0) { +p->push(); +{ + // Update the address in header + p->curr.execWidth = 1; + p->ADD(headeraddr, headeraddr, GenRegister::immud(128)); +} +p->pop(); + } + p->OBREAD(tmp, header, insn.getbti(), 8); + for (uint32_t j = 0; j < 8 / typesize ; j++) +p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp, 0 ,j * simdWidth * typesize )); +} + } +} else if (vec_size == 8) { + if (simdWidth == 8) { +for (uint32_t i = 0; i < typesize / 2; i++) { if (i > 0) { p->push(); { @@ -3564,11 +3593,11 @@ namespace gbe p->OBREAD(tmp, header, insn.getbti(), 8); } p->pop(); - for (uint32_t j = 0; j < 4; j++) -p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, j)); + for (uint32_t j = 0; j < 16 / typesize; j++) +p->MOV(ra->genReg(insn.dst(1 + j + i * 4)), GenRegister::offset(tmp, 0, j * simdWidth * typesize )); } } else { -for (uint32_t i = 0; i < vec_size / 2; i++) { +for (uint32_t i = 0; i < typesize ; i++) { if (i > 0) { p->push(); { @@ -3579,8 +3608,8 @@ namespace gbe p->pop(); } p->OBREAD(tmp, header, insn.getbti(), 8); - for (uint32_t j = 0; j < 2; 
j++) -p->MOV(ra->genReg(insn.dst(1 + j + i * 2)), GenRegister::offset(tmp,
[Beignet] [PATCH 08/14] Backend: Change the sel ir optimization for unpack register
From: Pan Xiuli To unpack UW we may need to add a mov, and we do not want this mov to be optimized by the sel ir optimization. Add a check for hstride to avoid this kind of optimization. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_insn_selection_optimize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/src/backend/gen_insn_selection_optimize.cpp b/backend/src/backend/gen_insn_selection_optimize.cpp index b8aa776..56c7615 100644 --- a/backend/src/backend/gen_insn_selection_optimize.cpp +++ b/backend/src/backend/gen_insn_selection_optimize.cpp @@ -161,7 +161,7 @@ namespace gbe assert(insn.opcode == SEL_OP_MOV); const GenRegister& src = insn.src(0); const GenRegister& dst = insn.dst(0); -if (src.type != dst.type || src.file != dst.file) +if (src.type != dst.type || src.file != dst.file || src.hstride != dst.hstride) return; if (liveout.find(dst.reg()) != liveout.end()) -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 09/14] Backend: Add short sub group builtin functions
From: Pan XiuliAdd intel sub group short type builtins. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_context.cpp | 12 + backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 44 backend/src/libocl/tmpl/ocl_simd.tmpl.h | 36 ++ 3 files changed, 92 insertions(+) diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp index e907931..a1ae5ea 100644 --- a/backend/src/backend/gen_context.cpp +++ b/backend/src/backend/gen_context.cpp @@ -2878,6 +2878,10 @@ namespace gbe p->MOV(dataReg, GenRegister::immint64(0x0)); else if (dataReg.type == GEN_TYPE_UL) p->MOV(dataReg, GenRegister::immuint64(0x0)); + else if (dataReg.type == GEN_TYPE_W) +p->MOV(dataReg, GenRegister::immw(0x0)); + else if (dataReg.type == GEN_TYPE_UW) +p->MOV(dataReg, GenRegister::immuw(0x0)); else GBE_ASSERT(0); /* unsupported data-type */ } @@ -2896,6 +2900,10 @@ namespace gbe p->MOV(dataReg, GenRegister::immint64(0x7FFFL)); else if (dataReg.type == GEN_TYPE_UL) p->MOV(dataReg, GenRegister::immuint64(0xL)); + else if (dataReg.type == GEN_TYPE_W) +p->MOV(dataReg, GenRegister::immw(0x7FFF)); + else if (dataReg.type == GEN_TYPE_UW) +p->MOV(dataReg, GenRegister::immuw(0x)); else GBE_ASSERT(0); /* unsupported data-type */ } @@ -2914,6 +2922,10 @@ namespace gbe p->MOV(dataReg, GenRegister::immint64(0x8000L)); else if (dataReg.type == GEN_TYPE_UL) p->MOV(dataReg, GenRegister::immuint64(0x0)); + else if (dataReg.type == GEN_TYPE_W) +p->MOV(dataReg, GenRegister::immw(0x8000)); + else if (dataReg.type == GEN_TYPE_UW) +p->MOV(dataReg, GenRegister::immuw(0x0)); else GBE_ASSERT(0); /* unsupported data-type */ } diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index d1bcfa3..90c7cc2 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -76,6 +76,8 @@ RANGE_OP(reduce, add, ulong, false) RANGE_OP(reduce, add, half, true) RANGE_OP(reduce, add, float, true) RANGE_OP(reduce, add, double, true) 
+RANGE_OP(reduce, add, short, true) +RANGE_OP(reduce, add, ushort, false) /* reduce min */ RANGE_OP(reduce, min, int, true) RANGE_OP(reduce, min, uint, false) @@ -84,6 +86,8 @@ RANGE_OP(reduce, min, ulong, false) RANGE_OP(reduce, min, half, true) RANGE_OP(reduce, min, float, true) RANGE_OP(reduce, min, double, true) +RANGE_OP(reduce, min, short, true) +RANGE_OP(reduce, min, ushort, false) /* reduce max */ RANGE_OP(reduce, max, int, true) RANGE_OP(reduce, max, uint, false) @@ -92,6 +96,8 @@ RANGE_OP(reduce, max, ulong, false) RANGE_OP(reduce, max, half, true) RANGE_OP(reduce, max, float, true) RANGE_OP(reduce, max, double, true) +RANGE_OP(reduce, max, short, true) +RANGE_OP(reduce, max, ushort, false) /* scan_inclusive add */ RANGE_OP(scan_inclusive, add, int, true) @@ -101,6 +107,8 @@ RANGE_OP(scan_inclusive, add, ulong, false) RANGE_OP(scan_inclusive, add, half, true) RANGE_OP(scan_inclusive, add, float, true) RANGE_OP(scan_inclusive, add, double, true) +RANGE_OP(scan_inclusive, add, short, true) +RANGE_OP(scan_inclusive, add, ushort, false) /* scan_inclusive min */ RANGE_OP(scan_inclusive, min, int, true) RANGE_OP(scan_inclusive, min, uint, false) @@ -109,6 +117,8 @@ RANGE_OP(scan_inclusive, min, ulong, false) RANGE_OP(scan_inclusive, min, half, true) RANGE_OP(scan_inclusive, min, float, true) RANGE_OP(scan_inclusive, min, double, true) +RANGE_OP(scan_inclusive, min, short, true) +RANGE_OP(scan_inclusive, min, ushort, false) /* scan_inclusive max */ RANGE_OP(scan_inclusive, max, int, true) RANGE_OP(scan_inclusive, max, uint, false) @@ -117,6 +127,8 @@ RANGE_OP(scan_inclusive, max, ulong, false) RANGE_OP(scan_inclusive, max, half, true) RANGE_OP(scan_inclusive, max, float, true) RANGE_OP(scan_inclusive, max, double, true) +RANGE_OP(scan_inclusive, max, short, true) +RANGE_OP(scan_inclusive, max, ushort, false) /* scan_exclusive add */ RANGE_OP(scan_exclusive, add, int, true) @@ -126,6 +138,8 @@ RANGE_OP(scan_exclusive, add, ulong, false) RANGE_OP(scan_exclusive, 
add, half, true) RANGE_OP(scan_exclusive, add, float, true) RANGE_OP(scan_exclusive, add, double, true) +RANGE_OP(scan_exclusive, add, short, true) +RANGE_OP(scan_exclusive, add, ushort, false) /* scan_exclusive min */ RANGE_OP(scan_exclusive, min, int, true) RANGE_OP(scan_exclusive, min, uint, false) @@ -134,6 +148,8 @@ RANGE_OP(scan_exclusive, min, ulong, false) RANGE_OP(scan_exclusive, min, half, true) RANGE_OP(scan_exclusive, min, float, true) RANGE_OP(scan_exclusive, min, double, true) +RANGE_OP(scan_exclusive, min, short, true) +RANGE_OP(scan_exclusive, min, ushort,
[Beignet] [PATCH 01/14] Libocl: Add intel_subgroups_short extension
From: Pan Xiuli We support the intel_subgroups_short extension. Signed-off-by: Pan Xiuli --- backend/src/libocl/include/ocl.h | 1 + src/cl_extensions.h | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h index 5e3a788..6230b93 100644 --- a/backend/src/libocl/include/ocl.h +++ b/backend/src/libocl/include/ocl.h @@ -114,6 +114,7 @@ #define cl_khr_fp16 #define cl_khr_3d_image_writes #define cl_intel_subgroups +#define cl_intel_subgroups_short #pragma OPENCL EXTENSION cl_khr_fp64 : disable #pragma OPENCL EXTENSION cl_khr_fp16 : disable diff --git a/src/cl_extensions.h b/src/cl_extensions.h index 1139775..c32e085 100644 --- a/src/cl_extensions.h +++ b/src/cl_extensions.h @@ -28,7 +28,8 @@ #define DECL_INTEL_EXTENSIONS \ DECL_EXT(intel_accelerator) \ DECL_EXT(intel_motion_estimation) \ - DECL_EXT(intel_subgroups) + DECL_EXT(intel_subgroups) \ + DECL_EXT(intel_subgroups_short) #define DECL_GL_EXTENSIONS \ DECL_EXT(khr_gl_sharing)\ @@ -63,7 +64,7 @@ cl_khr_extension_id_max #define OPT1_EXT_START_ID EXT_ID(khr_int64_base_atomics) #define OPT1_EXT_END_ID EXT_ID(khr_icd) #define INTEL_EXT_START_ID EXT_ID(intel_accelerator) -#define INTEL_EXT_END_ID EXT_ID(intel_subgroups) +#define INTEL_EXT_END_ID EXT_ID(intel_subgroups_short) #define GL_EXT_START_ID EXT_ID(khr_gl_sharing) #define GL_EXT_END_ID EXT_ID(khr_gl_msaa_sharing) -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 12/14] Utest: Add test case for short type sub group shuffle
From: Pan XiuliSigned-off-by: Pan Xiuli --- kernels/compiler_sub_group_shuffle.cl | 22 +++- kernels/compiler_sub_group_shuffle_down.cl | 23 - kernels/compiler_sub_group_shuffle_up.cl | 23 - kernels/compiler_sub_group_shuffle_xor.cl | 23 - utests/compiler_sub_group_shuffle.cpp | 52 ++-- utests/compiler_sub_group_shuffle_down.cpp | 54 -- utests/compiler_sub_group_shuffle_up.cpp | 54 -- utests/compiler_sub_group_shuffle_xor.cpp | 54 -- 8 files changed, 289 insertions(+), 16 deletions(-) diff --git a/kernels/compiler_sub_group_shuffle.cl b/kernels/compiler_sub_group_shuffle.cl index 322da74..c771eea 100644 --- a/kernels/compiler_sub_group_shuffle.cl +++ b/kernels/compiler_sub_group_shuffle.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -16,3 +16,23 @@ __kernel void compiler_sub_group_shuffle(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) +dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_max_sub_group_size() - get_sub_group_local_id() - 1; + short o0 = get_sub_group_local_id(); + short o1 = intel_sub_group_shuffle(from, c); + short o2 = intel_sub_group_shuffle(from, 5); + short o3 = intel_sub_group_shuffle(from, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/kernels/compiler_sub_group_shuffle_down.cl b/kernels/compiler_sub_group_shuffle_down.cl index 769fc3f..40bac05 100644 --- a/kernels/compiler_sub_group_shuffle_down.cl +++ b/kernels/compiler_sub_group_shuffle_down.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_down(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_down_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ 
__kernel void compiler_sub_group_shuffle_down(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_down_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) +dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_max_sub_group_size() - get_sub_group_local_id() - 1; + int k = get_sub_group_local_id() + 1; + short o0 = intel_sub_group_shuffle_down((short)123, (short)456, c); + short o1 = intel_sub_group_shuffle_down((short)123, from, c); + short o2 = intel_sub_group_shuffle_down(from, (short)-from, k); + short o3 = intel_sub_group_shuffle_down(from, (short)321, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git a/kernels/compiler_sub_group_shuffle_up.cl b/kernels/compiler_sub_group_shuffle_up.cl index 5c5cee1..fd287d5 100644 --- a/kernels/compiler_sub_group_shuffle_up.cl +++ b/kernels/compiler_sub_group_shuffle_up.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_up(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_up_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_up(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_up_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) +dst[0] = get_max_sub_group_size(); + dst++; + + short from = i; + int j = get_sub_group_local_id() + 1; + int k = get_max_sub_group_size() - get_sub_group_local_id() - 1; + short o0 = intel_sub_group_shuffle_up((short)123, (short)456, c); + short o1 = intel_sub_group_shuffle_up((short)123, from, c); + short o2 = intel_sub_group_shuffle_up(from, (short)-from, k); + short o3 = intel_sub_group_shuffle_up(from, (short)321, j); + dst[i*4] = o0; + dst[i*4+1] = o1; + dst[i*4+2] = o2; + dst[i*4+3] = o3; +} +#endif diff --git 
a/kernels/compiler_sub_group_shuffle_xor.cl b/kernels/compiler_sub_group_shuffle_xor.cl index 8bc15d3..df3dfe7 100644 --- a/kernels/compiler_sub_group_shuffle_xor.cl +++ b/kernels/compiler_sub_group_shuffle_xor.cl @@ -1,4 +1,4 @@ -__kernel void compiler_sub_group_shuffle_xor(global int *dst, int c) +__kernel void compiler_sub_group_shuffle_xor_int(global int *dst, int c) { int i = get_global_id(0); if (i == 0) @@ -17,3 +17,24 @@ __kernel void compiler_sub_group_shuffle_xor(global int *dst, int c) dst[i*4+2] = o2; dst[i*4+3] = o3; } +#ifdef SHORT +__kernel void compiler_sub_group_shuffle_xor_short(global short *dst, int c) +{ + short i = get_global_id(0); + if (i == 0) +dst[0] =
[Beignet] [PATCH 14/14] Utest: Add subgroup block read/write ushort test case
From: Pan XiuliAdd ushort block read/write for buffer and image. Refine uint block read/write with suffix _ui. Signed-off-by: Pan Xiuli --- kernels/compiler_subgroup_buffer_block_read.cl | 47 ++-- kernels/compiler_subgroup_buffer_block_write.cl | 44 +-- kernels/compiler_subgroup_image_block_read.cl | 49 +++-- kernels/compiler_subgroup_image_block_write.cl | 46 +--- utests/compiler_subgroup_buffer_block_read.cpp | 73 +++--- utests/compiler_subgroup_buffer_block_write.cpp | 74 --- utests/compiler_subgroup_image_block_read.cpp | 98 +++-- utests/compiler_subgroup_image_block_write.cpp | 73 +++--- 8 files changed, 412 insertions(+), 92 deletions(-) diff --git a/kernels/compiler_subgroup_buffer_block_read.cl b/kernels/compiler_subgroup_buffer_block_read.cl index 9edaa2e..4cbf894 100644 --- a/kernels/compiler_subgroup_buffer_block_read.cl +++ b/kernels/compiler_subgroup_buffer_block_read.cl @@ -1,31 +1,62 @@ -__kernel void compiler_subgroup_buffer_block_read1(global uint *src, global uint *dst) +__kernel void compiler_subgroup_buffer_block_read_ui1(global uint *src, global uint *dst) { int id = get_global_id(0); global uint * p = src + get_sub_group_id() * get_max_sub_group_size(); - uint tmp = intel_sub_group_block_read(p); + uint tmp = intel_sub_group_block_read_ui(p); dst[id] = tmp; } -__kernel void compiler_subgroup_buffer_block_read2(global uint *src, global uint2 *dst) +__kernel void compiler_subgroup_buffer_block_read_ui2(global uint *src, global uint2 *dst) { int id = get_global_id(0); global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*2; - uint2 tmp = intel_sub_group_block_read2(p); + uint2 tmp = intel_sub_group_block_read_ui2(p); dst[id] = tmp; } -__kernel void compiler_subgroup_buffer_block_read4(global uint *src, global uint4 *dst) +__kernel void compiler_subgroup_buffer_block_read_ui4(global uint *src, global uint4 *dst) { int id = get_global_id(0); global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*4; - uint4 tmp = 
intel_sub_group_block_read4(p); + uint4 tmp = intel_sub_group_block_read_ui4(p); dst[id] = tmp; } -__kernel void compiler_subgroup_buffer_block_read8(global uint *src, global uint8 *dst) +__kernel void compiler_subgroup_buffer_block_read_ui8(global uint *src, global uint8 *dst) { int id = get_global_id(0); global uint * p = src + get_sub_group_id() * get_max_sub_group_size()*8; - uint8 tmp = intel_sub_group_block_read8(p); + uint8 tmp = intel_sub_group_block_read_ui8(p); dst[id] = tmp; } +#ifdef SHORT +__kernel void compiler_subgroup_buffer_block_read_us1(global ushort *src, global ushort *dst) +{ + int id = get_global_id(0); + global ushort * p = src + get_sub_group_id() * get_max_sub_group_size(); + ushort tmp = intel_sub_group_block_read_us(p); + dst[id] = tmp; +} +__kernel void compiler_subgroup_buffer_block_read_us2(global ushort *src, global ushort2 *dst) +{ + int id = get_global_id(0); + global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*2; + ushort2 tmp = intel_sub_group_block_read_us2(p); + dst[id] = tmp; +} +__kernel void compiler_subgroup_buffer_block_read_us4(global ushort *src, global ushort4 *dst) +{ + int id = get_global_id(0); + global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*4; + ushort4 tmp = intel_sub_group_block_read_us4(p); + dst[id] = tmp; +} + +__kernel void compiler_subgroup_buffer_block_read_us8(global ushort *src, global ushort8 *dst) +{ + int id = get_global_id(0); + global ushort * p = src + get_sub_group_id() * get_max_sub_group_size()*8; + ushort8 tmp = intel_sub_group_block_read_us8(p); + dst[id] = tmp; +} +#endif diff --git a/kernels/compiler_subgroup_buffer_block_write.cl b/kernels/compiler_subgroup_buffer_block_write.cl index f735855..f452dcc 100644 --- a/kernels/compiler_subgroup_buffer_block_write.cl +++ b/kernels/compiler_subgroup_buffer_block_write.cl @@ -1,27 +1,55 @@ -__kernel void compiler_subgroup_buffer_block_write1(global uint *src, global uint *dst) +__kernel void 
compiler_subgroup_buffer_block_write_ui1(global uint *src, global uint *dst) { int id = get_global_id(0); global uint * p = dst + get_sub_group_id() * get_max_sub_group_size(); - intel_sub_group_block_write(p,src[id]); + intel_sub_group_block_write_ui(p,src[id]); } -__kernel void compiler_subgroup_buffer_block_write2(global uint2 *src, global uint *dst) +__kernel void compiler_subgroup_buffer_block_write_ui2(global uint2 *src, global uint *dst) { int id = get_global_id(0); global uint * p = dst + get_sub_group_id() * get_max_sub_group_size()*2; - intel_sub_group_block_write2(p,src[id]); + intel_sub_group_block_write_ui2(p,src[id]); } -__kernel void compiler_subgroup_buffer_block_write4(global uint4 *src, global uint *dst) +__kernel
[Beignet] [PATCH 11/14] Backend: Add sub groups short shuffle builtin functions
From: Pan XiuliAdd short type sub group shuffle(simd shuffle) Signed-off-by: Pan Xiuli --- backend/src/ir/instruction.cpp | 5 +++-- backend/src/libocl/tmpl/ocl_simd.tmpl.cl | 6 ++ backend/src/libocl/tmpl/ocl_simd.tmpl.h | 11 +++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp index ed64580..08a94cd 100644 --- a/backend/src/ir/instruction.cpp +++ b/backend/src/ir/instruction.cpp @@ -1493,8 +1493,9 @@ namespace ir { INLINE bool SimdShuffleInstruction::wellFormed(const Function , std::string ) const { - if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT)) { -whyNot = "Only support S32/U32/FLOAT type"; + if (UNLIKELY( this->type != TYPE_U32 && this->type != TYPE_S32 && this->type != TYPE_FLOAT && +this->type != TYPE_U16 && this->type != TYPE_S16)) { +whyNot = "Only support S16/U16/S32/U32/FLOAT type"; return false; } diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl index 90c7cc2..9023107 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.cl +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.cl @@ -283,6 +283,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_down(TYPE x, TYPE y, uint c) { \ SHUFFLE_DOWN(float) SHUFFLE_DOWN(int) SHUFFLE_DOWN(uint) +SHUFFLE_DOWN(short) +SHUFFLE_DOWN(ushort) #undef SHUFFLE_DOWN #define SHUFFLE_UP(TYPE) \ @@ -296,6 +298,8 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_up(TYPE x, TYPE y, uint c) { \ SHUFFLE_UP(float) SHUFFLE_UP(int) SHUFFLE_UP(uint) +SHUFFLE_UP(short) +SHUFFLE_UP(ushort) #undef SHUFFLE_UP #define SHUFFLE_XOR(TYPE) \ OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \ @@ -304,4 +308,6 @@ OVERLOADABLE TYPE intel_sub_group_shuffle_xor(TYPE x, uint c) { \ SHUFFLE_XOR(float) SHUFFLE_XOR(int) SHUFFLE_XOR(uint) +SHUFFLE_XOR(short) +SHUFFLE_XOR(ushort) #undef SHUFFLE_XOR diff --git a/backend/src/libocl/tmpl/ocl_simd.tmpl.h b/backend/src/libocl/tmpl/ocl_simd.tmpl.h 
index d0f06d1..158c8e1 100644 --- a/backend/src/libocl/tmpl/ocl_simd.tmpl.h +++ b/backend/src/libocl/tmpl/ocl_simd.tmpl.h @@ -169,15 +169,26 @@ OVERLOADABLE half intel_sub_group_shuffle(half x, uint c); OVERLOADABLE float intel_sub_group_shuffle(float x, uint c); OVERLOADABLE int intel_sub_group_shuffle(int x, uint c); OVERLOADABLE uint intel_sub_group_shuffle(uint x, uint c); +OVERLOADABLE short intel_sub_group_shuffle(short x, uint c); +OVERLOADABLE ushort intel_sub_group_shuffle(ushort x, uint c); + OVERLOADABLE float intel_sub_group_shuffle_down(float x, float y, uint c); OVERLOADABLE int intel_sub_group_shuffle_down(int x, int y, uint c); OVERLOADABLE uint intel_sub_group_shuffle_down(uint x, uint y, uint c); +OVERLOADABLE short intel_sub_group_shuffle_down(short x, short y, uint c); +OVERLOADABLE ushort intel_sub_group_shuffle_down(ushort x, ushort y, uint c); + OVERLOADABLE float intel_sub_group_shuffle_up(float x, float y, uint c); OVERLOADABLE int intel_sub_group_shuffle_up(int x, int y, uint c); OVERLOADABLE uint intel_sub_group_shuffle_up(uint x, uint y, uint c); +OVERLOADABLE short intel_sub_group_shuffle_up(short x, short y, uint c); +OVERLOADABLE ushort intel_sub_group_shuffle_up(ushort x, ushort y, uint c); + OVERLOADABLE float intel_sub_group_shuffle_xor(float x, uint c); OVERLOADABLE int intel_sub_group_shuffle_xor(int x, uint c); OVERLOADABLE uint intel_sub_group_shuffle_xor(uint x, uint c); +OVERLOADABLE short intel_sub_group_shuffle_xor(short x, uint c); +OVERLOADABLE ushort intel_sub_group_shuffle_xor(ushort x, uint c); /* blocak read/write */ OVERLOADABLE uint intel_sub_group_block_read(const global uint* p); -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 02/14] Backend: Refine GenRegister::offset
From: Pan Xiuli Convert the excess subnr into nr when subnr is 32 or larger, so that offset can be used universally. Signed-off-by: Pan Xiuli --- backend/src/backend/gen_register.hpp | 4 1 file changed, 4 insertions(+) diff --git a/backend/src/backend/gen_register.hpp b/backend/src/backend/gen_register.hpp index bbea761..bda35e4 100644 --- a/backend/src/backend/gen_register.hpp +++ b/backend/src/backend/gen_register.hpp @@ -268,6 +268,10 @@ namespace gbe static INLINE GenRegister offset(GenRegister reg, int nr, int subnr = 0) { GenRegister r = reg; + if(subnr >= 32){ +nr += subnr / 32; +subnr = subnr % 32; + } r.nr += nr; r.subnr += subnr; r.subphysical = 1; -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet
[Beignet] [PATCH 07/14] Utest: Add test case for sub group broadcast short
From: Pan Xiuli Signed-off-by: Pan Xiuli --- kernels/compiler_subgroup_broadcast.cl | 10 ++ utests/compiler_subgroup_broadcast.cpp | 11 +++ 2 files changed, 21 insertions(+) diff --git a/kernels/compiler_subgroup_broadcast.cl b/kernels/compiler_subgroup_broadcast.cl index 63e9568..3d16d67 100644 --- a/kernels/compiler_subgroup_broadcast.cl +++ b/kernels/compiler_subgroup_broadcast.cl @@ -32,6 +32,16 @@ kernel void compiler_subgroup_broadcast_long(global long *src, long broadcast_val = sub_group_broadcast(val, simd_id); dst[index] = broadcast_val; } +kernel void compiler_subgroup_broadcast_short(global short *src, +global short *dst, +uint simd_id) +{ + uint index = get_global_id(0); + + short val = src[index]; + short broadcast_val = sub_group_broadcast(val, simd_id); + dst[index] = broadcast_val; +} #else #pragma OPENCL EXTENSION cl_khr_fp16 : enable kernel void compiler_subgroup_broadcast_half(global half *src, diff --git a/utests/compiler_subgroup_broadcast.cpp b/utests/compiler_subgroup_broadcast.cpp index 5aa749c..33ec43c 100644 --- a/utests/compiler_subgroup_broadcast.cpp +++ b/utests/compiler_subgroup_broadcast.cpp @@ -190,6 +190,17 @@ void compiler_subgroup_broadcast_long(void) subgroup_generic(input, expected); } MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(compiler_subgroup_broadcast_long); +void compiler_subgroup_broadcast_short(void) +{ + if(!cl_check_subgroups_short()) +return; + cl_short *input = NULL; + cl_short *expected = NULL; + OCL_CREATE_KERNEL_FROM_FILE("compiler_subgroup_broadcast", + "compiler_subgroup_broadcast_short"); + subgroup_generic(input, expected); +} +MAKE_UTEST_FROM_FUNCTION(compiler_subgroup_broadcast_short); void compiler_subgroup_broadcast_half(void) { if(!cl_check_subgroups()) -- 2.7.4 ___ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet