Dear PyCuda community, First of all I would like to introduce myself: I am a scientific developer and I am pretty new to PyCuda (even if I followed a CUDA course). I would like to port part of a very big application to GPU, switching from FFTw to scikit.cuda (the cu_fft part). This was straightforward, thanks to the very good abstraction done in PyCuda. I already got a speed-up of 5x with exactly the same result compared to fftw.
My problems start when integrating the code into python threads; indeed, the large application will make all PyCuda calls from different threads, and it ends with memory leaks on the GPU, crashing after a couple of minutes. So I need to enforce that all python threads use the same context on the GPU. I have another question: why is data1_gpu.ptr changing, whereas data2_gpu.ptr and plan stay fixed (as expected) in my code? Thanks for your help. Cheers -- Jérôme Kieffer Data analysis unit - ESRF
#!/usr//bin/python #coding: utf8 from __future__ import with_statement __author__ = "Jérôme Kieffer" __contact__ = "[email protected]" __license__ = "GPLv3+" __copyright__ = "2011, ESRF, Grenoble" __date__ = "20120112" __doc__ = "This is a python module to measure image offsets using pyfftw3 or fftpack" import os, threading, time, gc try: import fftw3 except ImportError: fftw3 = None try: import pycuda import pycuda.autoinit import pycuda.elementwise import pycuda.gpuarray as gpuarray import scikits.cuda.fft as cu_fft except ImportError: cu_fft = None import numpy class CudaCorrelate(object): plans = {} data1_gpus = {} data2_gpus = {} multconj = None ctx = None sem = threading.Semaphore() def __init__(self, shape): self.shape = tuple(shape) def init(self): with self.__class__.sem: if self.ctx is None: self.__class__.ctx = pycuda.autoinit.context self.ctx.push() if self.shape not in self.__class__.plans: print "Single exec plan" self.__class__.plans[self.shape] = cu_fft.Plan(self.shape, numpy.complex128, numpy.complex128) if self.shape not in self.__class__.data1_gpus: print "Single exec data1" self.__class__.data1_gpus[self.shape] = gpuarray.empty(self.shape, numpy.complex128) if self.shape not in self.__class__.data2_gpus: print "Single exec data2" self.__class__.data2_gpus[self.shape] = gpuarray.empty(self.shape, numpy.complex128) if not self.__class__.multconj: self.__class__.multconj = pycuda.elementwise.ElementwiseKernel("pycuda::complex<double> *a, pycuda::complex<double> *b", "a[i]*=conj(b[i])") self.ctx.synchronize() # self.ctx.detach() self.ctx.pop() def correlate(self, data1, data2): self.init() with self.__class__.sem: self.ctx.push() plan = self.__class__.plans[self.shape] data1_gpu = self.__class__.data1_gpus[self.shape] data2_gpu = self.__class__.data2_gpus[self.shape] print data1_gpu.ptr, data2_gpu.ptr, plan data1_gpu.set(data1.astype(numpy.complex128)) cu_fft.fft(data1_gpu, data1_gpu, plan) print data1_gpu.ptr, data2_gpu.ptr, plan 
data2_gpu.set(data2.astype(numpy.complex128)) cu_fft.fft(data2_gpu, data2_gpu, plan) # data1_gpu *= data2_gpu.conj() self.multconj(data1_gpu, data2_gpu) cu_fft.ifft(data1_gpu, data1_gpu, plan, True) res = data1_gpu.get().real print data1_gpu.ptr, data2_gpu.ptr, plan self.ctx.synchronize() # self.ctx.detach() self.ctx.pop() if __name__=='__main__': shape = (2001,1001) data1 = numpy.random.random(shape) data2 = numpy.random.random(shape) cc = CudaCorrelate(shape) cc.init() print("should be Working") for i in range(50): cc.correlate(data1,data2) print("Memory leaks") for i in range(50): t=threading.Thread(target=cc.correlate,args=(data1,data2)) t.start()
_______________________________________________ PyCUDA mailing list [email protected] http://lists.tiker.net/listinfo/pycuda
