On Jul 18, 2013, at 10:46 PM, Michael McNeil Forbes
<[email protected]> wrote:
> What is the recommended way of preparing ElementwiseKernel instances for
> repeated calling on the same GPU arrays for performance?
Here is one attempt… is this headed in the right direction or is there a
better/safer way?
class PreparedElementwiseKernel(object):
    """Cache a prepared elementwise kernel for repeated invocation.

    Mirrors ``ElementwiseKernel.__call__``, but performs the argument
    inspection, kernel generation, and launch-configuration work once in
    ``__init__`` and stores the results, so each subsequent ``__call__``
    only collects device pointers and launches.

    NOTE(review): contiguity and launch configuration are validated and
    computed once, at construction time -- callers must pass arrays in
    ``__call__`` that match those given to ``__init__``.
    """

    def __init__(self, kernel, *args, **kwargs):
        """Prepare *kernel* (an ``ElementwiseKernel``) for *args*.

        Keyword arguments:
            range  -- a slice-like object with start/stop/step, limiting
                      the elements operated on
            slice  -- a ``slice`` resolved against the first vector
                      argument's size (mutually exclusive with ``range``)
            stream -- CUDA stream to launch on (or None)

        Raises TypeError for unknown keywords, for range+slice given
        together, or when no vector argument is supplied.
        """
        is_vector_arg = []
        vectors = []

        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        stream = kwargs.pop("stream", None)
        if kwargs:
            # ", ".join(kwargs) iterates the remaining (invalid) keys;
            # works on both Python 2 and 3 (the original used iterkeys()).
            raise TypeError("invalid keyword arguments specified: "
                            + ", ".join(kwargs))

        invocation_args = []
        # The generated kernel differs depending on whether a range/slice
        # restricts the iteration, hence the boolean argument.
        func, arguments = kernel.generate_stride_kernel_and_types(
            range_ is not None or slice_ is not None)

        for arg, arg_descr in zip(args, arguments):
            if isinstance(arg_descr, pycuda.tools.VectorArg):
                if not arg.flags.forc:
                    raise RuntimeError("elementwise kernel cannot "
                                       "deal with non-contiguous arrays")
                vectors.append(arg)
                is_vector_arg.append(True)
            else:
                is_vector_arg.append(False)

        # The original code crashed with a bare IndexError here when no
        # vector argument was supplied; fail with a clear message instead.
        if not vectors:
            raise TypeError("elementwise kernel requires at least one "
                            "vector argument")
        # First vector argument is representative for size/launch config.
        repr_vec = vectors[0]

        if slice_ is not None:
            if range_ is not None:
                raise TypeError("may not specify both range and slice "
                                "keyword arguments")
            # Resolve the slice against the representative vector's size
            # into a concrete start/stop/step triple.
            range_ = slice(*slice_.indices(repr_vec.size))

        if range_ is not None:
            # A slice's step may be None; normalize it to 1 *before* using
            # it, both as an invocation argument and in the launch-size
            # computation.  (The original divided by range_.step directly
            # below, raising TypeError whenever step was None.)
            step = 1 if range_.step is None else range_.step
            invocation_args.append(range_.start)
            invocation_args.append(range_.stop)
            invocation_args.append(step)

            from pycuda.gpuarray import splay
            grid, block = splay(abs(range_.stop - range_.start)//step)
        else:
            # Full-array launch: reuse the array's precomputed config.
            block = repr_vec._block
            grid = repr_vec._grid
            invocation_args.append(repr_vec.mem_size)

        # Everything __call__ needs, computed exactly once.
        self.func = func
        self.grid = grid
        self.block = block
        self.stream = stream
        self.is_vector_arg = is_vector_arg
        self.extra_invocation_args = invocation_args

    def __call__(self, *args):
        """Launch the prepared kernel on *args* (same layout as __init__).

        Vector arguments are reduced to their device pointers; scalar
        arguments are passed through unchanged.
        """
        invocation_args = [arg.gpudata if is_vec else arg
                           for is_vec, arg in zip(self.is_vector_arg, args)]
        invocation_args.extend(self.extra_invocation_args)
        self.func.prepared_async_call(self.grid, self.block, self.stream,
                                      *invocation_args)
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda