Hola AFE´s, GLO´s ELUG´s
Los dos programas de abajo (uno en Fortran y otro en C para CUDA)
calculan el cuadrado de ocho números complejos:
primero de forma normal, con un procesador de la computadora,
y después con la tarjeta de video NVIDIA (CUDA), de hasta 512 procesadores.
compilar como:
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda#
/usr/local/cuda/bin/nvcc -c -O3 Cuda_function.cu
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda# gfortran -o complex_mul main.f90 Cuda_function.o -L/usr/local/cuda/lib -lcudart
para ejecutar:
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda# ./complex_mul
Results from Fortran
1 ( 1.0000000 , 2.0000000 ) ( -3.0000000 ,
4.0000000 )
2 ( 2.0000000 , 4.0000000 ) ( -12.000000 ,
16.000000 )
3 ( 3.0000000 , 6.0000000 ) ( -27.000000 ,
36.000000 )
4 ( 4.0000000 , 8.0000000 ) ( -48.000000 ,
64.000000 )
5 ( 5.0000000 , 10.000000 ) ( -75.000000 ,
100.00000 )
6 ( 6.0000000 , 12.000000 ) ( -108.00000 ,
144.00000 )
7 ( 7.0000000 , 14.000000 ) ( -147.00000 ,
196.00000 )
8 ( 8.0000000 , 16.000000 ) ( -192.00000 ,
256.00000 )
Results from CUDA
1 ( 1.0000000 , 2.0000000 ) ( -3.0000000 ,
4.0000000 )
2 ( 2.0000000 , 4.0000000 ) ( -12.000000 ,
16.000000 )
3 ( 3.0000000 , 6.0000000 ) ( -27.000000 ,
36.000000 )
4 ( 4.0000000 , 8.0000000 ) ( -48.000000 ,
64.000000 )
5 ( 5.0000000 , 10.000000 ) ( -75.000000 ,
100.00000 )
6 ( 6.0000000 , 12.000000 ) ( -108.00000 ,
144.00000 )
7 ( 7.0000000 , 14.000000 ) ( -147.00000 ,
196.00000 )
8 ( 8.0000000 , 16.000000 ) ( -192.00000 ,
256.00000 )
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda#
main.f90 (programa en Fortran)
-------------------------------------------------------------------------------------
program main
  implicit none
  ! Working precision: the kind of a default (single-precision) real
  integer, parameter :: fp_kind = kind(0.0)
  ! Number of complex samples
  integer, parameter :: N = 8
  complex(fp_kind), dimension(N) :: c, c2
  integer :: i

  ! Fill c with (i, 2i) for i = 1..N and square it element-wise on the host
  c  = (/ (cmplx(i, 2*i), i = 1, N) /)
  c2 = c*c

  ! Show the host-side reference values
  call show_results("Results from Fortran")

  ! Clear c2 so that the values printed next can only come from the CUDA call
  c2 = cmplx(0., 0.)

  ! Repeat the computation through the external routine:
  ! Fortran -> C -> CUDA -> C -> Fortran
  call cudafunction(c, c2, N)

  ! Show what the external routine stored in c2
  call show_results("Results from CUDA")

contains

  ! Print a title line followed by one "index, c, c2" line per element.
  subroutine show_results(title)
    character(len=*), intent(in) :: title
    integer :: k

    print *, title
    do k = 1, N
      print *, k, c(k), c2(k)
    end do
  end subroutine show_results

end program main
Cuda_function.cu (programa en C para CUDA)
-------------------------------------------------------------------------------------------------
#include <stdio.h>
#include <cuComplex.h>
#include "cuda.h"
/*
 * Device-side product of two single-precision complex numbers.
 *
 *   (a.x + i*a.y) * (b.x + i*b.y)
 *       = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x)
 *
 * a, b : operands, with .x holding the real part and .y the imaginary part.
 * Returns the product a*b.
 */
__device__ cuComplex ComplexMul(cuComplex a, cuComplex b)
{
    const float re = a.x * b.x - a.y * b.y;
    const float im = a.x * b.y + a.y * b.x;
    return make_cuFloatComplex(re, im);
}
/*
 * CUDA kernel: out[i] = in[i] * in[i] for every i in [0, N).
 *
 * in  : device pointer to N complex inputs.
 * out : device pointer to N complex outputs. It may be the same pointer as
 *       `in`: each element is read and written by the same thread only.
 * N   : number of elements; N <= 0 makes the kernel a no-op.
 *
 * Launch layout: 1D grid of 1D blocks. The loop below strides by the total
 * number of launched threads, so any non-empty grid covers all N elements;
 * a grid of ceil(N / blockDim.x) blocks gives one element per thread.
 * No shared memory is used.
 */
__global__ void square_complex(cuComplex *in, cuComplex *out, int N)
{
    /* Compare in a signed type first: a negative N converted to unsigned
       would become a huge bound and let every thread index out of range. */
    if (N <= 0)
    {
        return;
    }

    /* size_t arithmetic keeps blockIdx.x * blockDim.x from wrapping at 32 bits. */
    const size_t count  = (size_t)N;
    const size_t stride = (size_t)gridDim.x * blockDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         i < count;
         i += stride)
    {
        out[i] = ComplexMul(in[i], in[i]);
    }
}
/*
 * Report a failed CUDA runtime call on stderr.
 * Returns 0 when err is cudaSuccess, 1 otherwise.
 */
static int cuda_call_failed(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
    {
        return 0;
    }
    fprintf(stderr, "cudafunction: %s failed: %s\n", what, cudaGetErrorString(err));
    return 1;
}

/*
 * Fortran-callable entry point: b(i) = a(i)*a(i) for i = 1..N, computed on
 * the GPU.
 *
 * Fortran subroutine arguments are passed by reference, so
 *     call cudafunction(array_a, array_b, N)
 * is mapped to
 *     cudafunction_(*a, *b, *N);
 * (the trailing underscore matches the symbol name gfortran generates).
 *
 * a  : host array of *Np complex inputs (read only).
 * b  : host array of *Np complex outputs (overwritten on success).
 * Np : pointer to the element count.
 *
 * The interface returns nothing, so failures cannot be signalled to the
 * caller: on any CUDA error a message is written to stderr, the device
 * buffer is released, and b is left untouched. A count <= 0 is a no-op.
 */
extern "C" void cudafunction_(cuComplex *a, cuComplex *b, int *Np)
{
    /* Multiple of the warp size (32); the result does not depend on it. */
    const unsigned int block_size = 256;
    const int N = *Np;
    cuComplex *a_d = NULL;

    /* Nothing to do; also avoids a zero-sized grid and a negative byte count. */
    if (N <= 0)
    {
        return;
    }

    const size_t bytes = sizeof(cuComplex) * (size_t)N;

    /* Allocate complex array on device */
    if (cuda_call_failed(cudaMalloc((void **)&a_d, bytes), "cudaMalloc"))
    {
        return;
    }

    /* Copy array from host memory to device memory */
    if (!cuda_call_failed(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                          "cudaMemcpy (host to device)"))
    {
        /* Compute execution configuration: ceil(N / block_size) blocks.
           Unsigned arithmetic so that N + block_size - 1 cannot overflow int. */
        dim3 dimBlock(block_size);
        dim3 dimGrid(((unsigned int)N + block_size - 1u) / block_size);

        /* Execute the kernel in place on the device buffer */
        square_complex<<<dimGrid, dimBlock>>>(a_d, a_d, N);

        /* A launch does not return a status; query it explicitly. */
        if (!cuda_call_failed(cudaGetLastError(), "square_complex launch"))
        {
            /* Copy the result back. This blocking copy also surfaces any
               error raised while the kernel was executing. */
            cuda_call_failed(cudaMemcpy(b, a_d, bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy (device to host)");
        }
    }

    /* Free memory on the device (on success and on every error path) */
    cuda_call_failed(cudaFree(a_d), "cudaFree");
    return;
}
saludos,
:)
Joel Rodríguez