Signed-off-by: Grigore Lupescu <grigore.lupe...@intel.com> --- benchmark/benchmark_workgroup_reduce.cpp | 157 +++++++++++++++++++++++++------ kernels/bench_workgroup_reduce.cl | 32 ++++++- 2 files changed, 160 insertions(+), 29 deletions(-)
diff --git a/benchmark/benchmark_workgroup_reduce.cpp b/benchmark/benchmark_workgroup_reduce.cpp index 815b6b5..c93ef26 100644 --- a/benchmark/benchmark_workgroup_reduce.cpp +++ b/benchmark/benchmark_workgroup_reduce.cpp @@ -9,30 +9,30 @@ double benchmark_workgroup_add_uint(void) { double elapsed = 0; struct timeval start,stop; - const size_t set_size = 512 * 256; - const size_t set_local_size = 64; + const size_t global_size = 512 * 256; + const size_t local_size = 128; const uint32_t reduce_loop = 10000; /* Input set will be generated */ - uint32_t* src = (uint32_t*)calloc(sizeof(uint32_t), set_size); + uint32_t* src = (uint32_t*)calloc(sizeof(uint32_t), global_size); OCL_ASSERT(src != NULL); - for(uint32_t i = 0; i < set_size; i++){ - src[i] = 1; + for(uint32_t i = 0; i < global_size; i++){ + src[i] = (i / local_size); } /* Setup kernel and buffers */ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", "bench_workgroup_reduce_add_uint"); - OCL_CREATE_BUFFER(buf[0], 0, (set_size) * sizeof(uint32_t), NULL); - OCL_CREATE_BUFFER(buf[1], 0, (set_size) * sizeof(uint32_t), NULL); + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(uint32_t), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(uint32_t), NULL); OCL_MAP_BUFFER(0); - memcpy(buf_data[0], src, set_size * sizeof(uint32_t)); + memcpy(buf_data[0], src, global_size * sizeof(uint32_t)); OCL_UNMAP_BUFFER(0); - globals[0] = set_size; - locals[0] = set_local_size; + globals[0] = global_size; + locals[0] = local_size; OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); @@ -47,43 +47,95 @@ double benchmark_workgroup_add_uint(void) /* Check results */ OCL_MAP_BUFFER(1); - for(uint32_t i = 1; i < set_size; i += set_size){ + for(uint32_t i = 0; i < global_size; i += local_size){ //printf(" %u", ((uint32_t*)buf_data[1])[i]); - OCL_ASSERT(((uint32_t*)buf_data[1])[i] == set_local_size); + OCL_ASSERT( ((uint32_t*)buf_data[1])[i] == i ); } OCL_UNMAP_BUFFER(1); - return BANDWIDTH(set_size * reduce_loop, elapsed); + return BANDWIDTH(global_size * reduce_loop, elapsed); } MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_uint, "M/sec"); +double benchmark_workgroup_min_uint(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + uint32_t* src = (uint32_t*)calloc(sizeof(uint32_t), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++){ + src[i] = i; + } + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_min_uint"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(uint32_t), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(uint32_t), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(uint32_t)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = local_size/2; i < global_size; i += local_size){ + //printf(" %u", ((uint32_t*)buf_data[1])[i]); + OCL_ASSERT( ((uint32_t*)buf_data[1])[i] == (src[i] - (local_size / 2)) ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_uint, "M/sec"); + double benchmark_workgroup_add_float(void) { double elapsed = 0; struct timeval start,stop; - const size_t set_size = 512 * 256; - const size_t set_local_size = 64; + const size_t global_size = 512 * 256; + const size_t local_size = 128; const uint32_t reduce_loop = 10000; /* Input set will be generated */ - float* src = (float*)calloc(sizeof(float), set_size); + float* src = (float*)calloc(sizeof(float), global_size); OCL_ASSERT(src != NULL); - for(uint32_t i = 0; i < set_size; i++) - src[i] = 1.0f; + for(uint32_t i = 0; i < global_size; i++) + src[i] = (i / local_size); /* Setup kernel and buffers */ OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", "bench_workgroup_reduce_add_float"); - OCL_CREATE_BUFFER(buf[0], 0, (set_size) * sizeof(float), NULL); - OCL_CREATE_BUFFER(buf[1], 0, (set_size) * sizeof(float), NULL); + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL); OCL_MAP_BUFFER(0); - memcpy(buf_data[0], src, set_size * sizeof(float)); + memcpy(buf_data[0], src, global_size * sizeof(float)); OCL_UNMAP_BUFFER(0); - globals[0] = set_size; - locals[0] = set_local_size; + globals[0] = global_size; + locals[0] = local_size; OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); @@ -98,12 +150,63 @@ double benchmark_workgroup_add_float(void) /* Check results */ OCL_MAP_BUFFER(1); - for(uint32_t i = 1; i < set_size; i += set_size){ - //printf("%f ", ((float*)buf_data[1])[i]); - OCL_ASSERT(((float*)buf_data[1])[i] == set_local_size); + for(uint32_t i = 0; i < global_size; i += local_size){ + //printf(" %f", ((float*)buf_data[1])[i]); + OCL_ASSERT( ((float*)buf_data[1])[i] == (float)i ); } OCL_UNMAP_BUFFER(1); - return BANDWIDTH(set_size * reduce_loop, elapsed); + return BANDWIDTH(global_size * reduce_loop, elapsed); } MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_add_float, "M/sec"); + +double benchmark_workgroup_min_float(void) +{ + double elapsed = 0; + struct timeval start,stop; + const size_t global_size = 512 * 256; + const size_t local_size = 128; + const uint32_t reduce_loop = 10000; + + /* Input set will be generated */ + float* src = (float*)calloc(sizeof(float), global_size); + OCL_ASSERT(src != NULL); + for(uint32_t i = 0; i < global_size; i++) + src[i] = 1.0f * i + 1; + + /* Setup kernel and buffers */ + OCL_CREATE_KERNEL_FROM_FILE("bench_workgroup_reduce", + "bench_workgroup_reduce_min_float"); + + OCL_CREATE_BUFFER(buf[0], 0, (global_size) * sizeof(float), NULL); + OCL_CREATE_BUFFER(buf[1], 0, (global_size) * sizeof(float), NULL); + + OCL_MAP_BUFFER(0); + memcpy(buf_data[0], src, global_size * sizeof(float)); + OCL_UNMAP_BUFFER(0); + + globals[0] = global_size; + locals[0] = local_size; + + OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]); + OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]); + OCL_SET_ARG(2, sizeof(cl_uint), &reduce_loop); + + /* Measure performance */ + gettimeofday(&start,0); + OCL_NDRANGE(1); + clFinish(queue); + gettimeofday(&stop,0); + elapsed = time_subtract(&stop, &start, 0); + + /* Check results */ + OCL_MAP_BUFFER(1); + for(uint32_t i = local_size/2; i < global_size; i += local_size){ + //printf(" %f", ((float*)buf_data[1])[i]); + OCL_ASSERT( ((float*)buf_data[1])[i] == (src[i] - (local_size / 2)) ); + } + OCL_UNMAP_BUFFER(1); + + return BANDWIDTH(global_size * reduce_loop, elapsed); +} +MAKE_BENCHMARK_FROM_FUNCTION(benchmark_workgroup_min_float, "M/sec"); diff --git a/kernels/bench_workgroup_reduce.cl b/kernels/bench_workgroup_reduce.cl index 9e2f848..ba1c709 100644 --- a/kernels/bench_workgroup_reduce.cl +++ b/kernels/bench_workgroup_reduce.cl @@ -3,7 +3,7 @@ kernel void bench_workgroup_reduce_add_uint( global uint *dst, uint reduce_loop) { - uint val = src[get_local_id(0)]; + uint val = src[get_global_id(0)]; uint sum = work_group_reduce_add(val); for(; reduce_loop > 0; reduce_loop--) @@ -12,12 +12,26 @@ kernel void bench_workgroup_reduce_add_uint( dst[get_global_id(0)] = sum; } +kernel void bench_workgroup_reduce_min_uint( + global uint *src, + global uint *dst, + uint reduce_loop) +{ + uint val = src[get_global_id(0)]; + uint min = work_group_reduce_min(val); + + for(; reduce_loop > 0; reduce_loop--) + min = work_group_reduce_min(val); + + dst[get_global_id(0)] = min; +} + kernel void bench_workgroup_reduce_add_float( global float *src, global float *dst, uint reduce_loop) { - float val = src[get_local_id(0)]; + float val = src[get_global_id(0)]; float sum = work_group_reduce_add(val); for(; reduce_loop > 0; reduce_loop--) @@ -25,3 +39,17 @@ kernel void bench_workgroup_reduce_add_float( dst[get_global_id(0)] = sum; } + +kernel void bench_workgroup_reduce_min_float( + global float *src, + global float *dst, + uint reduce_loop) +{ + float val = src[get_global_id(0)]; + float min = work_group_reduce_min(val); + + for(; reduce_loop > 0; reduce_loop--) + min = work_group_reduce_min(val); + + dst[get_global_id(0)] = min; +} -- 2.5.0 _______________________________________________ Beignet mailing list Beignet@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/beignet