# -*- coding: utf-8 -*-
from pylab import *
import time
import pycuda.autoinit
import pycuda.driver as cu
import pycuda.compiler as nvcc
import pycuda.gpuarray
from pycuda import gpuarray

transpose_kernel_source = \
"""
#define TILE_DIM 32
#define BLOCK_ROWS 32

__global__ void transpose_kernel(float *odata, float* idata, int width, int height)
{
  int xIndex = blockIdx.x * TILE_DIM + threadIdx.x;
  int yIndex = blockIdx.y * TILE_DIM + threadIdx.y;

  int index_in  = xIndex + width * yIndex;
  int index_out = yIndex + height * xIndex;

    for (int i=0; i<TILE_DIM; i+=BLOCK_ROWS) {
      odata[index_out+i] = idata[index_in+i*width];
    }
  
}

"""

# Compile the CUDA kernel at runtime and cache the result for later reuse
transpose_kernel_source_module = nvcc.SourceModule( transpose_kernel_source )
# Get a callable handle to the compiled CUDA kernel
transpose_kernel = transpose_kernel_source_module.get_function( "transpose_kernel" )

# Read image

a=rand(8,2).astype(float32)

c=a.transpose()
b_zeros=zeros(c.shape).astype(float32)
a_device=gpuarray.to_gpu(a)

b_device=gpuarray.to_gpu(b_zeros)

print c.shape
print c.dtype




# Get image data
heightA    = int32( a.shape[0] )
widthA      = int32( a.shape[1] )


tile_dimx=int32(2)
tile_dimy=int32(2)




n_TPB     = 2
n_blocks  = 2     # Number of thread blocks
         # Number of threads per block

transpose_kernel(b_device,a_device,widthA,heightA,grid=(int(widthA/tile_dimx),int(heightA/tile_dimy)),block=(int(tile_dimx),int(tile_dimy),1))
c_result_gpu=b_device.get()

print c_result_gpu
print c