Hi everybody.

I have run into another problem with my NVIDIA graphics cards and pyopencl. I wrote a simple kernel that computes the (pixelwise) median of an image and writes the result to another image. I also wrote a small event visualiser with Python's matplotlib to get an idea of the execution times.

My setup is simple: one context on one device with two command queues. I enqueue my copies on one queue and my kernel executions on the other, and the copies are associated with events that the kernel waits for. What I expect is that when I repeat this process several times, the copies on one queue are executed in parallel with the kernel running on the other queue. What I actually see is that during kernel execution there is no parallel work on the other queue.
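
For reference, this is roughly the enqueue pattern I use (a minimal sketch, not the attached code; `cl` is pyopencl, and copy_q, compute_q, knl, d_in, h_in and global_size stand in for the real objects):

    # non-blocking host-to-device copy on the copy queue; returns an event
    copy_evt = cl.enqueue_copy(copy_q, d_in, h_in, is_blocking=False,
                               origin=(0, 0, 0), region=h_in.shape[::-1])
    # kernel on the second queue, gated only on the copy event
    knl_evt = cl.enqueue_nd_range_kernel(compute_q, knl, global_size, None,
                                         wait_for=[copy_evt])

Since the two in-order queues are otherwise independent, the only ordering constraint between them should be the explicit wait_for dependency, so I would expect the next iteration's copy on copy_q to overlap with the kernel on compute_q.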

Is this a problem with my code or with NVIDIA's implementation?


I tried this on the following setups:


TITAN Xp on NVIDIA CUDA (driver version 384.81)
Tesla K10.G2.8GB on NVIDIA CUDA (driver version 375.39)
GeForce GTX TITAN on NVIDIA CUDA (driver version 384.81)
GeForce GTX TITAN on NVIDIA CUDA (driver version 375.66)


Here's an image showing the timeline of the execution of a median on a numpy array of shape (512, 512, 512) with a median window of 7.

[attached image: execution timeline]

All code is attached. To run it, the tested cards have to be able to write to images, and matplotlib.pyplot is needed for the profiling visualisation.
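
If you want to check up front whether a card qualifies, something like this should do (a small sketch; the extension check is there because the kernel writes to 3D output images):

    import pyopencl as cl

    dev = cl.get_platforms()[0].get_devices()[0]
    assert dev.image_support, 'device has no image support'
    assert 'cl_khr_3d_image_writes' in dev.extensions, \
        'device cannot write to 3D images'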

Any help is very welcome, especially if this behaviour can't be reproduced on other (ideally non-NVIDIA) setups.


Best

Jonathan Schock

Attachment: median_kernels.cl
Description: median_kernels.cl

# -*- coding: utf-8 -*-


import pyopencl as cl
import numpy as np
import matplotlib.pyplot as plt


Mf = cl.mem_flags
# numeric codes used by p_image: 0 = read-only, 1 = write-only, 2 = read-write
M_DIC = {0: Mf.READ_ONLY, 1: Mf.WRITE_ONLY, 2: Mf.READ_WRITE}

EVENT_LIST = []




PLATFORM = cl.get_platforms()[0]
DEVS = PLATFORM.get_devices()
DEVICE_1 = DEVS[0]

# maximum height/width of a 3D image on the device
IMAGE_DIMENSIONS = [int(DEVICE_1.image3d_max_height),
                    int(DEVICE_1.image3d_max_width)]



CTX = cl.Context([DEVICE_1, ])
Queue1 = cl.CommandQueue(
    CTX, DEVICE_1, properties=cl.command_queue_properties.PROFILING_ENABLE)
Queue2 = cl.CommandQueue(
    CTX, DEVICE_1, properties=cl.command_queue_properties.PROFILING_ENABLE)
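# Two in-order command queues on the same context and device: Queue1 carries
# the host<->device copies, Queue2 the kernel launches, so that (in principle)
# they can overlap. Profiling is enabled so the event timestamps can be
# plotted by profile_events() below.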



def p_image(queue_s, host, name, rw=2, return_event=False, blocking=True):
    """Create a 3D image matching `host` and enqueue a host-to-device copy on
    the queue named by `queue_s`; the copy event is recorded for profiling."""
    global EVENT_LIST
    device = cl.Image(CTX, M_DIC[rw],
                      cl.ImageFormat(cl.channel_order.INTENSITY,
                                     cl.channel_type.FLOAT),
                      host.shape[::-1])
    # the queue is looked up by name so the name can be stored with the event
    event = cl.enqueue_copy(eval(queue_s), device, host,
                            is_blocking=blocking,
                            origin=(0, 0, 0),
                            region=host.shape[::-1])
    EVENT_LIST.append((event, queue_s, name))
    if return_event:
        return device, event
    else:
        return device


def p_kernel(queue_s, kernel, size, args=(), wait=None, return_event=False):
    """Set the kernel arguments and enqueue the kernel on the queue named by
    `queue_s`, optionally waiting on the events in `wait`."""
    q = eval(queue_s)
    global EVENT_LIST
    kernel.set_args(*args)
    event = cl.enqueue_nd_range_kernel(q, kernel, size, None, wait_for=wait)
    EVENT_LIST.append((event, queue_s, kernel.function_name))
    if return_event:
        return event


with open('median_kernels.cl', 'r') as f:
    fstr = f.read()
prg = cl.Program(CTX, fstr).build()
general_median_knl = prg.median
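# kernel arguments: input image, output image, median window size (int32)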
general_median_knl.set_scalar_arg_dtypes([None, None, np.int32])


def median(input_array, size):
    """Enqueue the copies on Queue1 and the median kernel on Queue2 (gated on
    the copy events), so that copies and kernel execution can overlap."""
    if not input_array.flags.c_contiguous:
        raise TypeError('Array is not C-contiguous')
    x, y, z = DEVICE_1.max_work_item_sizes
    a_z, a_y, a_x = input_array.shape
    if not (a_x <= x and a_y <= y):
        raise TypeError('The input array width or height exceeds the maximum work item size')
    if a_x > IMAGE_DIMENSIONS[1] or a_y > IMAGE_DIMENSIONS[0]:
        raise TypeError('Image exceeds the maximum device image size')
    overlap = a_z % int(z)
    runs = (a_z - overlap) // int(z)  # floor division so range() gets an int
    if size % 2 == 0:
        raise TypeError('Size must be odd')
    h_out = np.empty((z, a_y, a_x), dtype=np.float32)
    for i in range(runs):
        # upload the input volume on Queue1 (non-blocking)
        d_image, e1 = p_image('Queue1', input_array, 'Copy_in', return_event=True, rw=0, blocking=False)
        # create/initialise the output image, also on Queue1 (non-blocking)
        d_out, e2 = p_image('Queue1', h_out, 'Copy_out', return_event=True, rw=1, blocking=False)
        # launch the kernel on Queue2, waiting only on the two copy events
        median_e = p_kernel('Queue2', general_median_knl, (a_x, a_y, z),
                            (d_image, d_out, size), return_event=True, wait=[e1, e2])
        # read the result back on Queue1 once the kernel has finished
        cl.enqueue_copy(Queue1, h_out, d_out, wait_for=[median_e, ], origin=(0, 0, 0),
                        region=h_out.shape[::-1])


def profile_events(event_list):
    """Plot one bar per event, one row per queue, from the profiling timestamps."""
    queue_names = {e[1] for e in event_list}
    queue_coding = {}
    mid = []
    for i, name in enumerate(queue_names):
        queue_coding[name] = i + 1
        mid.append(i * 0.5 + 0.25)

    plt.figure()
    colorcoding = {1: 'aqua', 2: 'greenyellow', 3: 'b', 4: 'r'}
    # profiling timestamps are in nanoseconds; plot in seconds relative to the first event
    start = 1e-9 * event_list[0][0].profile.start
    plt.plot([0, 0, 0], [0, 0.2, 0.3])
    max_r = []
    for e in event_list:
        y_v = queue_coding[e[1]]
        c = colorcoding[y_v]
        l = (1e-9 * e[0].profile.start) - start
        w = 1e-9 * (e[0].profile.end - e[0].profile.start)
        max_r.append(w + l)
        plt.bar(l, 0.5, w, (y_v - 1) * 0.5, color=c, alpha=0.5, align='edge', edgecolor='black')
        plt.text(l + 0.1 * w, ((y_v - 1) * 0.5) + 0.25,
                 e[2] + ' = ' + str(int(w * 1000)) + ' ms',
                 fontsize=12, rotation=90,
                 horizontalalignment='left', verticalalignment='center')
        plt.axhline(y=0.5 * y_v, color='black')

   
    plt.yticks(mid, list(queue_coding.keys()))
    plt.axis([0, max(max_r), 0, len(queue_names) * 0.5])


def main():
    test_data = np.ones((512, 512, 512), dtype=np.float32)
    median(test_data, 7)
    profile_events(EVENT_LIST)
    plt.show()  # display the timeline plot


if __name__ == '__main__':
    main()