Hi everybody. I have a problem again with my Nvidia graphics cards and pyopencl. I wrote a simple kernel that computes the (pixelwise) median
of an image and outputs that to another image. I also wrote an easy event visualiser in Python's matplotlib to get an idea of the execution time. I have a simple setup, where I have one context on one device with two queues. I enqueue my copies to one queue and my kernel execution to the other queue, where the copies are associated with an event for the kernel to wait for. What I expect is that when I repeat this process several times, the copies should be executed in parallel with the kernel execution on the other queue. What I see is that during kernel execution there is no parallel work on the other queue. Is this a problem with my code or with Nvidia's implementation? I tried this on the following setups: TITAN Xp on NVIDIA CUDA (driver version 384.81) Tesla K10.G2.8GB on NVIDIA CUDA (driver version 375.39) GeForce GTX TITAN on NVIDIA CUDA (driver version 384.81) GeForce GTX TITAN on NVIDIA CUDA (driver version 375.66) Here's an image showing the timeline of the execution of a median on a numpy array of size (512,512,512) and a median window of 7. [cid:d776dbf2-e974-4194-a929-5b12290956de] All code is attached. To run it, the tested cards have to be able to write to images, and for the profiling visualisation matplotlib.pyplot is needed. Any help is very welcome, especially if this behaviour can't be reproduced on other (probably non-NVIDIA) setups. Best Jonathan Schock
median_kernels.cl
Description: median_kernels.cl
# -*- coding: utf-8 -*-
"""Pixelwise 3-D median filter via PyOpenCL with a two-queue timeline profiler.

Host->device copies are enqueued on one command queue and kernel launches on
another, linked by events, so that transfers can in principle overlap compute.
``profile_events`` draws the recorded event timeline with matplotlib to check
whether they actually did.
"""
import pyopencl as cl
import numpy as np
import matplotlib.pyplot as plt

Mf = cl.mem_flags
# Map the small integer ``rw`` flag used throughout this script to the
# corresponding OpenCL memory-access flag.
M_DIC = {0: Mf.READ_ONLY, 1: Mf.WRITE_ONLY, 2: Mf.READ_WRITE}
# (event, queue name, label) triples, in enqueue order; consumed by
# profile_events().
EVENT_LIST = []

PLATFORM = cl.get_platforms()[0]
DEVS = PLATFORM.get_devices()
DEVICE_1 = DEVS[0]

# [max 3-D image height, max 3-D image width] reported by the device.
IMAGE_DIMENSIONS = [int(DEVICE_1.image3d_max_height),
                    int(DEVICE_1.image3d_max_width)]

CTX = cl.Context([DEVICE_1])
Queue1 = cl.CommandQueue(
    CTX, DEVICE_1, properties=cl.command_queue_properties.PROFILING_ENABLE)
Queue2 = cl.CommandQueue(
    CTX, DEVICE_1, properties=cl.command_queue_properties.PROFILING_ENABLE)
# FIX: resolve queues by name through an explicit dict instead of eval()'ing
# the string (eval on a name is both unsafe and needlessly slow).
QUEUES = {'Queue1': Queue1, 'Queue2': Queue2}


def p_image(queue_s, host, name, rw=2, return_event=False, blocking=True):
    """Allocate a 3-D device image shaped like *host* and enqueue a copy to it.

    Parameters
    ----------
    queue_s : str
        Queue name, one of the keys of ``QUEUES``.
    host : numpy.ndarray
        C-contiguous 3-D float32 array; its reversed shape gives the image
        extent (OpenCL images are addressed (x, y, z)).
    name : str
        Label recorded with the copy event for the profiler.
    rw : int
        0 = read-only, 1 = write-only, 2 = read-write (see ``M_DIC``).
    return_event : bool
        If True, also return the copy event.
    blocking : bool
        Forwarded to ``enqueue_copy``'s ``is_blocking``.

    Returns
    -------
    cl.Image or (cl.Image, cl.Event)
    """
    device = cl.Image(CTX, M_DIC[rw],
                      cl.ImageFormat(cl.channel_order.INTENSITY,
                                     cl.channel_type.FLOAT),
                      host.shape[::-1])
    event = cl.enqueue_copy(QUEUES[queue_s], device, host,
                            is_blocking=blocking,
                            origin=(0, 0, 0), region=host.shape[::-1])
    EVENT_LIST.append((event, queue_s, name))
    if return_event:
        return device, event
    return device


def p_kernel(queue_s, kernel, size, args=(), wait=None, return_event=False):
    """Set *args* on *kernel* and enqueue it on the named queue.

    Parameters
    ----------
    queue_s : str
        Queue name, one of the keys of ``QUEUES``.
    kernel : cl.Kernel
    size : tuple
        Global NDRange size.
    args : tuple
        Kernel arguments, passed to ``kernel.set_args``.
    wait : list of cl.Event or None
        Events the kernel must wait for.  FIX: the original used a mutable
        default ``wait=[]``, which is shared between calls.
    return_event : bool
        If True, return the launch event (otherwise None).
    """
    kernel.set_args(*args)
    event = cl.enqueue_nd_range_kernel(QUEUES[queue_s], kernel, size, None,
                                       wait_for=wait)
    EVENT_LIST.append((event, queue_s, kernel.function_name))
    if return_event:
        return event


# FIX: read the kernel source inside a context manager so the file handle is
# closed deterministically (the original left it open).
with open('median_kernels.cl', 'r') as f:
    fstr = f.read()
prg = cl.Program(CTX, fstr).build()
general_median_knl = prg.median
general_median_knl.set_scalar_arg_dtypes([None, None, np.int32])


def median(input_array, size):
    """Run the median kernel over *input_array* in z-slabs.

    Parameters
    ----------
    input_array : numpy.ndarray
        C-contiguous 3-D float32 volume, indexed (z, y, x).
    size : int
        Odd median window size, forwarded to the kernel.

    Raises
    ------
    TypeError
        For non-contiguous input, even *size*, or device-limit violations.
    """
    if not input_array.flags.c_contiguous:
        raise TypeError('Array is not C-contiguous')
    if size % 2 == 0:
        raise TypeError('Size must be odd')
    x, y, z = DEVICE_1.max_work_item_sizes
    a_z, a_y, a_x = input_array.shape
    if not (a_x <= x and a_y <= y):
        raise TypeError('The input array width or height exceeds the maximum work item size')
    # NOTE(review): this compares the device work-item limits against the
    # image limits — presumably a_x/a_y were intended; confirm with author.
    if x > IMAGE_DIMENSIONS[0] or y > IMAGE_DIMENSIONS[1]:
        raise TypeError('Image exceeds the maximum device image size')
    overlap = a_z % int(z)
    # FIX: '/' is float division in Python 3 and range() needs an int;
    # use floor division.
    runs = (a_z - overlap) // int(z)
    # FIX: allocate float32 directly instead of empty()+astype (the original
    # allocated a float64 array only to immediately copy it to float32).
    h_out = np.empty((z, a_y, a_x), dtype=np.float32)
    for i in range(runs):
        # NOTE(review): every iteration copies the whole input_array and the
        # trailing ``overlap`` slices are never processed — presumably each
        # run should copy slab i; confirm against the kernel's indexing.
        d_image, e1 = p_image('Queue1', input_array, 'Copy_in',
                              return_event=True, rw=0, blocking=False)
        d_out, e2 = p_image('Queue1', h_out, 'Copy_out',
                            return_event=True, rw=1, blocking=False)
        median_e = p_kernel('Queue2', general_median_knl, (a_x, a_y, z),
                            (d_image, d_out, size),
                            return_event=True, wait=[e1, e2])
        # Blocking read-back of the result slab once the kernel finishes.
        cl.enqueue_copy(Queue1, h_out, d_out, wait_for=[median_e],
                        origin=(0, 0, 0), region=h_out.shape[::-1])


def profile_events(event_list):
    """Draw a per-queue timeline bar chart of profiled OpenCL events.

    Each queue gets a horizontal lane; each event becomes a bar from its
    profiled start to end time (seconds, relative to the first event),
    labelled with its name and duration in ms.
    """
    # FIX: guard against an empty event list (the original indexed [0]).
    if not event_list:
        return
    queue_names = {e[1] for e in event_list}
    # Assign each queue a 1-based lane number and remember lane midpoints
    # for the y-axis tick labels.
    queue_coding = {}
    c = 1
    mid = []
    for name in queue_names:
        queue_coding[name] = c
        c += 1
        mid.append((c - 2) * 0.5 + 0.25)
    plt.figure()
    colorcoding = {1: 'aqua', 2: 'greenyellow', 3: 'b', 4: 'r'}
    # Profile counters are in nanoseconds; convert to seconds and make all
    # times relative to the first recorded event.
    start = 1e-9 * event_list[0][0].profile.start
    plt.plot([0, 0, 0], [0, 0.2, 0.3])
    max_r = []
    for e in event_list:
        y_v = queue_coding[e[1]]
        c = colorcoding[y_v]
        l = (1e-9 * e[0].profile.start) - start
        w = 1e-9 * (e[0].profile.end - e[0].profile.start)
        max_r.append(w + l)
        plt.bar(l, 0.5, w, (y_v - 1) * 0.5, color=c, alpha=0.5,
                align='edge', edgecolor='black')
        plt.text(l + 0.1 * w, ((y_v - 1) * 0.5) + 0.25,
                 e[2] + ' = ' + str(int(w * 1000)) + ' ms',
                 fontsize=12, rotation=90,
                 horizontalalignment='left', verticalalignment='center')
        plt.axhline(y=0.5 * y_v, color='black')
    plt.yticks(mid, queue_coding.keys())
    plt.axis([0, max(max_r), 0, len(queue_names) * 0.5])


def main():
    """Median-filter a 512**3 test volume and plot the event timeline."""
    test_data = np.ones((512, 512, 512), dtype=np.float32)
    median(test_data, 7)
    profile_events(EVENT_LIST)


# FIX: guard the entry point so importing this module does not immediately
# run the benchmark (the original called main() unconditionally).
if __name__ == '__main__':
    main()
_______________________________________________ PyOpenCL mailing list PyOpenCL@tiker.net https://lists.tiker.net/listinfo/pyopencl