On Jul 18, 2013, at 10:46 PM, Michael McNeil Forbes 
<[email protected]> wrote:

> What is the recommended way of preparing ElementwiseKernel instances for 
> repeated calling on the same GPU arrays for performance?

Here is one attempt… is this headed in the right direction or is there a 
better/safer way?

class PreparedElementwiseKernel(object):
    """Cache the prepared function and launch parameters of an
    ElementwiseKernel for cheap repeated invocation on the same arrays.

    This mirrors ElementwiseKernel.__call__: all per-call setup (kernel
    generation, argument inspection, grid/block selection) is done once
    in __init__ and stored on the instance; __call__ then only assembles
    the device-pointer argument list and launches the prepared kernel.
    """

    def __init__(self, kernel, *args, **kwargs):
        """Prepare *kernel* for repeated calls on *args*.

        :param kernel: a pycuda ElementwiseKernel instance.
        :param args: the GPU arrays / scalars the kernel will later be
            invoked with (same order as in __call__).
        :param kwargs: optional ``range``, ``slice`` and ``stream``
            keywords with the same meaning as in
            ElementwiseKernel.__call__; any other keyword is rejected.
        :raises TypeError: for unknown keywords, if both ``range`` and
            ``slice`` are given, or if no vector (GPU array) argument
            is present.
        :raises RuntimeError: if a GPU array argument is not contiguous.
        """
        is_vector_arg = []
        vectors = []

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        stream = kwargs.pop("stream", None)

        if kwargs:
            # .keys() works under both Python 2 and 3 (iterkeys is 2-only).
            raise TypeError("invalid keyword arguments specified: "
                    + ", ".join(kwargs.keys()))

        invocation_args = []
        # The strided kernel variant is needed as soon as a range or
        # slice restricts the iteration.
        func, arguments = kernel.generate_stride_kernel_and_types(
                range_ is not None or slice_ is not None)

        for arg, arg_descr in zip(args, arguments):
            if isinstance(arg_descr, pycuda.tools.VectorArg):
                if not arg.flags.forc:
                    raise RuntimeError("elementwise kernel cannot "
                            "deal with non-contiguous arrays")

                vectors.append(arg)
                is_vector_arg.append(True)
            else:
                is_vector_arg.append(False)

        if not vectors:
            # Fail with a clear message instead of an opaque IndexError
            # on vectors[0] below.
            raise TypeError("elementwise kernel needs at least one "
                    "vector (GPU array) argument")
        repr_vec = vectors[0]

        if slice_ is not None:
            if range_ is not None:
                raise TypeError("may not specify both range and slice "
                        "keyword arguments")

            # slice.indices() normalizes start/stop/step against the
            # representative array's size, yielding concrete values.
            range_ = slice(*slice_.indices(repr_vec.size))

        if range_ is not None:
            invocation_args.append(range_.start)
            invocation_args.append(range_.stop)
            # A user-supplied range/slice may still carry step=None;
            # treat it as 1 both for the kernel argument and for the
            # grid-size computation.  (The original code divided by
            # range_.step directly and raised TypeError for step=None.)
            if range_.step is None:
                step = 1
            else:
                step = range_.step
            invocation_args.append(step)

            from pycuda.gpuarray import splay
            grid, block = splay(abs(range_.stop - range_.start)//step)
        else:
            # Full-array launch: reuse the representative array's
            # precomputed launch configuration and element count.
            block = repr_vec._block
            grid = repr_vec._grid
            invocation_args.append(repr_vec.mem_size)

        self.func = func
        self.grid = grid
        self.block = block
        self.stream = stream
        self.is_vector_arg = is_vector_arg
        self.extra_invocation_args = invocation_args

    def __call__(self, *args):
        """Launch the prepared kernel on *args*.

        *args* must match the arguments given to __init__ in number and
        kind: GPU array arguments are passed by device pointer
        (``.gpudata``); everything else is passed through unchanged.
        """
        invocation_args = [_arg.gpudata if self.is_vector_arg[_n]
                           else _arg
                           for _n, _arg in enumerate(args)]
        invocation_args.extend(self.extra_invocation_args)
        self.func.prepared_async_call(self.grid, self.block, self.stream,
                                      *invocation_args)
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda

Reply via email to