Signed-off-by: Grigore Lupescu <grigore.lupe...@intel.com>
---
 benchmark/CMakeLists.txt                    |   3 +-
 benchmark/benchmark_workgroup_functions.cpp | 230 ++++++++++++++++++++++++++++
 2 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 benchmark/benchmark_workgroup_functions.cpp

diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index dd33829..fd7fd7d 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -18,7 +18,8 @@ set (benchmark_sources
   benchmark_copy_buffer_to_image.cpp
   benchmark_copy_image_to_buffer.cpp
   benchmark_copy_buffer.cpp
-  benchmark_copy_image.cpp)
+  benchmark_copy_image.cpp
+  benchmark_workgroup_functions.cpp)
 
 SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
 
diff --git a/benchmark/benchmark_workgroup_functions.cpp b/benchmark/benchmark_workgroup_functions.cpp
new file mode 100644
index 0000000..81403a0
--- /dev/null
+++ b/benchmark/benchmark_workgroup_functions.cpp
@@ -0,0 +1,230 @@
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+/*
+ * Benchmark the work-group add reduction kernel on uint32_t data.
+ *
+ * The input is set_num (set_size * set_size) sets of set_size elements, each
+ * of the form (1, 0, ..., 0). Every pass binds sub-buffers of buf[0] (kernel
+ * argument 0) and buf[1] (kernel argument 1) for each set and runs
+ * OCL_NDRANGE(1) on them, then copies the first output element of each of
+ * the first set_num_work sets to the front of buf[0] for the next pass.
+ * set_num_work is divided by set_size per pass; buf[1][0] must end as set_num.
+ *
+ * Returns the BANDWIDTH() value computed from
+ * set_num * set_size * sizeof(uint32_t) * 100 and the measured loop time.
+ */
+double benchmark_workgroup_add_uint(void)
+{
+  cl_int ret;
+  struct timeval start,stop;
+  const size_t set_size = 256;
+  const size_t set_num = set_size * set_size;
+  size_t set_num_work = set_num;
+  uint32_t* src = NULL; /* input set will be generated */
+
+  cl_mem sub_buf_in;
+  cl_mem sub_buf_out;
+  cl_buffer_region buf_region_in;
+  cl_buffer_region buf_region_out;
+
+  buf_region_in.size = set_size * sizeof(uint32_t);
+  buf_region_in.origin = 0;
+  buf_region_out.size = set_size * sizeof(uint32_t);
+  buf_region_out.origin = 0;
+
+  /* Each set is of the form (1, 0, 0, ..0) */
+  src = (uint32_t*)calloc(set_num * set_size, sizeof(uint32_t));
+  OCL_ASSERT(src != NULL);
+  for(size_t i = 0; i < set_num * set_size; i += set_size)
+    src[i] = 1;
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_uint");
+  OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(uint32_t), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, set_num* set_size * sizeof(uint32_t));
+  OCL_UNMAP_BUFFER(0);
+
+  /* The host copy is no longer needed once it has been uploaded */
+  free(src);
+  src = NULL;
+
+  globals[0] = set_size;
+  locals[0] = set_size;
+
+  /* Measure performance */
+  gettimeofday(&start,0);
+  while(set_num_work > 0){
+    /* Perform reductions, subBuffers with offsets.
+     * NOTE(review): every pass launches all set_num reductions although only
+     * the first set_num_work results are gathered below - presumably to keep
+     * the workload per pass constant; confirm. */
+    for(uint32_t i = 0; i < set_num; i++){
+      sub_buf_in = clCreateSubBuffer(buf[0], 0,
+        CL_BUFFER_CREATE_TYPE_REGION, &buf_region_in, &ret);
+      OCL_ASSERT(ret == CL_SUCCESS);
+      sub_buf_out = clCreateSubBuffer(buf[1], 0,
+        CL_BUFFER_CREATE_TYPE_REGION, &buf_region_out, &ret);
+      OCL_ASSERT(ret == CL_SUCCESS);
+
+      OCL_SET_ARG(0, sizeof(cl_mem), &sub_buf_in);
+      OCL_SET_ARG(1, sizeof(cl_mem), &sub_buf_out);
+      OCL_NDRANGE(1);
+
+      /* Release the sub-buffers so they do not accumulate across launches;
+       * OpenCL defers the actual deletion until queued commands that use
+       * the object have finished */
+      ret = clReleaseMemObject(sub_buf_in);
+      OCL_ASSERT(ret == CL_SUCCESS);
+      ret = clReleaseMemObject(sub_buf_out);
+      OCL_ASSERT(ret == CL_SUCCESS);
+
+      buf_region_in.origin += set_size * sizeof(uint32_t);
+      buf_region_out.origin += set_size * sizeof(uint32_t);
+    }
+    /* Prepare memory for next set of reductions */
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < set_num_work; i++) {
+      ((uint32_t *)buf_data[0])[i] =
+        ((uint32_t *)buf_data[1])[i * set_size];
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+    set_num_work /= set_size;
+    buf_region_in.origin = 0;
+    buf_region_out.origin = 0;
+  }
+  gettimeofday(&stop,0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check result, final sum */
+  OCL_MAP_BUFFER(1);
+  //printf("%u ", ((uint32_t *)buf_data[1])[0]);
+  OCL_ASSERT(((uint32_t *)buf_data[1])[0] == set_num);
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(set_num * set_size * sizeof(uint32_t) * 100, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "Mops/S");
+
+/*
+ * Benchmark the work-group add reduction kernel on float data.
+ *
+ * The input is set_num (set_size * set_size) sets of set_size elements, each
+ * of the form (1, 0, ..., 0). Every pass binds sub-buffers of buf[0] (kernel
+ * argument 0) and buf[1] (kernel argument 1) for each set and runs
+ * OCL_NDRANGE(1) on them, then copies the first output element of each of
+ * the first set_num_work sets to the front of buf[0] for the next pass.
+ * set_num_work is divided by set_size per pass; buf[1][0] must end as set_num.
+ *
+ * Returns the BANDWIDTH() value computed from
+ * set_num * set_size * sizeof(float) * 100 and the measured loop time.
+ */
+double benchmark_workgroup_add_float(void)
+{
+  cl_int ret;
+  struct timeval start,stop;
+  const size_t set_size = 256;
+  const size_t set_num = set_size * set_size;
+  size_t set_num_work = set_num;
+  float* src = NULL; /* input set will be generated */
+
+  cl_mem sub_buf_in;
+  cl_mem sub_buf_out;
+  cl_buffer_region buf_region_in;
+  cl_buffer_region buf_region_out;
+
+  buf_region_in.size = set_size * sizeof(float);
+  buf_region_in.origin = 0;
+  buf_region_out.size = set_size * sizeof(float);
+  buf_region_out.origin = 0;
+
+  /* Each set is of the form (1, 0, 0, ..0) */
+  src = (float*)calloc(set_num * set_size, sizeof(float));
+  OCL_ASSERT(src != NULL);
+  for(size_t i = 0; i < set_num * set_size; i += set_size)
+    src[i] = 1;
+
+  /* Setup kernel and buffers */
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_workgroup_reduce",
+                              "compiler_workgroup_reduce_add_float");
+  OCL_CREATE_BUFFER(buf[0], 0, (set_num * set_size) * sizeof(float), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, (set_num * set_size) * sizeof(float), NULL);
+
+  OCL_MAP_BUFFER(0);
+  memcpy(buf_data[0], src, set_num* set_size * sizeof(float));
+  OCL_UNMAP_BUFFER(0);
+
+  /* The host copy is no longer needed once it has been uploaded */
+  free(src);
+  src = NULL;
+
+  globals[0] = set_size;
+  locals[0] = set_size;
+
+  /* Measure performance */
+  gettimeofday(&start,0);
+  while(set_num_work > 0){
+    /* Perform reductions, subBuffers with offsets.
+     * NOTE(review): every pass launches all set_num reductions although only
+     * the first set_num_work results are gathered below - presumably to keep
+     * the workload per pass constant; confirm. */
+    for(uint32_t i = 0; i < set_num; i++){
+      sub_buf_in = clCreateSubBuffer(buf[0], 0,
+        CL_BUFFER_CREATE_TYPE_REGION, &buf_region_in, &ret);
+      OCL_ASSERT(ret == CL_SUCCESS);
+      sub_buf_out = clCreateSubBuffer(buf[1], 0,
+        CL_BUFFER_CREATE_TYPE_REGION, &buf_region_out, &ret);
+      OCL_ASSERT(ret == CL_SUCCESS);
+
+      OCL_SET_ARG(0, sizeof(cl_mem), &sub_buf_in);
+      OCL_SET_ARG(1, sizeof(cl_mem), &sub_buf_out);
+      OCL_NDRANGE(1);
+
+      /* Release the sub-buffers so they do not accumulate across launches;
+       * OpenCL defers the actual deletion until queued commands that use
+       * the object have finished */
+      ret = clReleaseMemObject(sub_buf_in);
+      OCL_ASSERT(ret == CL_SUCCESS);
+      ret = clReleaseMemObject(sub_buf_out);
+      OCL_ASSERT(ret == CL_SUCCESS);
+
+      buf_region_in.origin += set_size * sizeof(float);
+      buf_region_out.origin += set_size * sizeof(float);
+    }
+    /* Prepare memory for next set of reductions */
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < set_num_work; i++) {
+      ((float *)buf_data[0])[i] =
+        ((float *)buf_data[1])[i * set_size];
+    }
+    OCL_UNMAP_BUFFER(0);
+    OCL_UNMAP_BUFFER(1);
+
+    set_num_work /= set_size;
+    buf_region_in.origin = 0;
+    buf_region_out.origin = 0;
+  }
+  gettimeofday(&stop,0);
+  double elapsed = time_subtract(&stop, &start, 0);
+
+  /* Check result, final sum */
+  OCL_MAP_BUFFER(1);
+  //printf("%f ", ((float *)buf_data[1])[0]);
+  OCL_ASSERT(((float *)buf_data[1])[0] == set_num);
+  OCL_UNMAP_BUFFER(1);
+
+  return BANDWIDTH(set_num * set_size * sizeof(float) * 100, elapsed);
+}
+MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "Mflops/S");
-- 
2.1.4

_______________________________________________
Beignet mailing list
Beignet@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/beignet