Hello members,
I adapted the SDK diagonal transpose example and I want to transpose a
2776x2080 matrix. However I do not get the right transpose matrix. The
output is:
A
shape: (2776, 2080)
[[ 0 0 0 ..., 0 0 0]
[ 1 1 1 ..., 1 1 1]
[ 2 2 2 ..., 2 2 2]
...,
[2773 2773 2773 ..., 2773 2773 2773]
[2774 2774 2774 ..., 2774 2774 2774]
[2775 2775 2775 ..., 2775 2775 2775]]
--------------------------------------------------------------------------------
A.T
shape: (2080, 2776)
[[ 0 1 2 ..., 924 926 927]
[ 928 930 931 ..., 1853 1855 1856]
[1857 1859 1860 ..., 6 8 9]
...,
[2766 2767 2769 ..., 915 916 918]
[ 919 920 922 ..., 1844 1845 1847]
[1848 1849 1851 ..., 2773 2774 2775]]
--------------------------------------------------------------------------------
Did I miss something? What is wrong in my code?
If I transpose 2776x2776 or 2080x2080 matrices everything is fine.
Best regards,
Giuseppe
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
Forschungszentrum Juelich GmbH
52425 Juelich
Sitz der Gesellschaft: Juelich
Eingetragen im Handelsregister des Amtsgerichts Dueren Nr. HR B 3498
Vorsitzender des Aufsichtsrats: MinDir Dr. Karl Eugen Huthmacher
Geschaeftsfuehrung: Prof. Dr. Achim Bachem (Vorsitzender),
Karsten Beneke (stellv. Vorsitzender), Prof. Dr.-Ing. Harald Bolt,
Prof. Dr. Sebastian M. Schmidt
------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------
from __future__ import division
from pycuda.compiler import SourceModule
import numpy
import pdb
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
def _get_transpose_kernel():
mod = SourceModule("""
#define TILE_DIM 8
#define BLOCK_ROWS 1
__global__ void transpose(unsigned short *A_t, unsigned short *A, int a_width, int a_height)
{
__shared__ unsigned short block[TILE_DIM][TILE_DIM+1];
int blockIdx_x, blockIdx_y;
if(a_width == a_height) {
blockIdx_y = blockIdx.x;
blockIdx_x = (blockIdx.x + blockIdx.y) % gridDim.x;
} else {
int bid = blockIdx.x + gridDim.x*blockIdx.y;
blockIdx_y = bid % gridDim.y;
blockIdx_x = ((bid / gridDim.y) + blockIdx_y) % gridDim.x;
}
int xIndex = blockIdx_x * TILE_DIM + threadIdx.x;
int yIndex = blockIdx_y * TILE_DIM + threadIdx.y;
int index_in = xIndex + a_width * yIndex;
xIndex = blockIdx_y * TILE_DIM + threadIdx.x;
yIndex = blockIdx_x * TILE_DIM + threadIdx.y;
int index_out = xIndex + a_height * yIndex;
for (int i=0; i < TILE_DIM; i += BLOCK_ROWS)
{
block[threadIdx.y + i][threadIdx.x] = A[index_in + i * a_width];
}
__syncthreads();
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
{
A_t[index_out + i * a_height] = block[threadIdx.x][threadIdx.y + i];
}
}
""")
func = mod.get_function("transpose")
return func
def transpose(src):
w, h = src.shape
result = gpuarray.empty((h, w), dtype=src.dtype, order='C')
blocks = (8, 1, 1)
gridx = int(w / 8)
gridy = int(h / 8)
transpose_matrix = _get_transpose_kernel()
transpose_matrix(result.gpudata, src.gpudata, numpy.int32(w), numpy.int32(h), block=blocks, grid=(gridx, gridy))
return result
if __name__ == '__main__':
i_data = numpy.repeat(numpy.arange(0,2776), 2080).reshape(2776,-1)
i_data = numpy.array(i_data, dtype= numpy.uint16, order='C')
input = gpuarray.to_gpu(i_data)
result = transpose(input)
print "A"
print "shape: ", i_data.shape
print i_data
print "-"*80
print "A.T"
print "shape: ", result.shape
print result
print "-"*80
_______________________________________________
PyCUDA mailing list
[email protected]
http://lists.tiker.net/listinfo/pycuda