Hola AFE´s, GLO´s ELUG´s

Los dos programas de abajo, uno en Fortran y otro en C (CUDA),
calculan el cuadrado de cada elemento de un arreglo de números complejos:
primero de forma normal, con un procesador de la computadora,
y después con una tarjeta de video nVidia CUDA de hasta 512 procesadores.

compilar como:


r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda# /usr/local/cuda/bin/nvcc -c -O3 Cuda_function.cu
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda# gfortran -o complex_mul main.f90 Cuda_function.o -L/usr/local/cuda/lib -lcudart


para ejecutar:

r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda# ./complex_mul
 Results from Fortran
1 ( 1.0000000 , 2.0000000 ) ( -3.0000000 , 4.0000000 )
2 ( 2.0000000 , 4.0000000 ) ( -12.000000 , 16.000000 )
3 ( 3.0000000 , 6.0000000 ) ( -27.000000 , 36.000000 )
4 ( 4.0000000 , 8.0000000 ) ( -48.000000 , 64.000000 )
5 ( 5.0000000 , 10.000000 ) ( -75.000000 , 100.00000 )
6 ( 6.0000000 , 12.000000 ) ( -108.00000 , 144.00000 )
7 ( 7.0000000 , 14.000000 ) ( -147.00000 , 196.00000 )
8 ( 8.0000000 , 16.000000 ) ( -192.00000 , 256.00000 )
 Results from CUDA
1 ( 1.0000000 , 2.0000000 ) ( -3.0000000 , 4.0000000 )
2 ( 2.0000000 , 4.0000000 ) ( -12.000000 , 16.000000 )
3 ( 3.0000000 , 6.0000000 ) ( -27.000000 , 36.000000 )
4 ( 4.0000000 , 8.0000000 ) ( -48.000000 , 64.000000 )
5 ( 5.0000000 , 10.000000 ) ( -75.000000 , 100.00000 )
6 ( 6.0000000 , 12.000000 ) ( -108.00000 , 144.00000 )
7 ( 7.0000000 , 14.000000 ) ( -147.00000 , 196.00000 )
8 ( 8.0000000 , 16.000000 ) ( -192.00000 , 256.00000 )
r...@pc-joel:~/NVIDIA_GPU_Computing_SDK/C/src/Fortran_Cuda#



main.f90  (programa en Fortran)
-------------------------------------------------------------------------------------
program main
  ! Squares a small array of complex numbers twice: once directly in
  ! Fortran and once through the external routine "cudafunction", then
  ! prints both sets of results so they can be compared by eye.

  implicit none

  ! Kind of a default real literal, i.e. single precision.
  integer, parameter :: fp_kind = kind(0.0)

  ! Number of complex values processed.
  integer, parameter :: N = 8

  complex(fp_kind) :: c(N), c2(N)
  integer :: k

  ! Fill c with (k, 2k) for k = 1..N, then square every element.
  c  = (/ (cmplx(k, 2*k), k = 1, N) /)
  c2 = c*c

  ! Show the values computed by Fortran.
  print *, "Results from Fortran"
  do k = 1, N
    print *, k, c(k), c2(k)
  end do

  ! Clear c2 so that the values printed next can only have come from
  ! the external routine.
  c2 = (0., 0.)

  ! Repeat the computation through the external routine.
  ! Data path: Fortran -> C -> CUDA -> C -> Fortran
  call cudafunction(c, c2, N)

  ! Show the values returned by the external routine.
  print *, "Results from CUDA"
  do k = 1, N
    print *, k, c(k), c2(k)
  end do

end program main



Cuda_function.cu  (programa en C)
-------------------------------------------------------------------------------------------------
#include <stdio.h>
#include <cuComplex.h>
#include "cuda.h"


/*
 * Device-side product of two cuComplex values, treating .x as the real
 * part and .y as the imaginary part:
 *   (a.x + i*a.y) * (b.x + i*b.y)
 *     = (a.x*b.x - a.y*b.y) + i*(a.x*b.y + a.y*b.x)
 */
__device__ cuComplex ComplexMul(cuComplex a, cuComplex b)
{
    return make_cuFloatComplex(a.x * b.x - a.y * b.y,
                               a.x * b.y + a.y * b.x);
}

/*
 * CUDA kernel: out[i] = in[i] * in[i] for every i in [0, N).
 *
 * Launch layout: one thread per element, indexed along x only
 * (blockIdx.x, blockDim.x, threadIdx.x). Threads whose index falls
 * outside [0, N) do nothing, so the grid may be larger than N.
 *
 * in  - device pointer to at least N input values
 * out - device pointer to at least N output values; may be the same
 *       buffer as `in`, because each thread reads its element before
 *       writing it
 * N   - number of elements; N <= 0 makes the kernel a no-op
 */
__global__ void  square_complex(cuComplex *in, cuComplex *out, int N)
{
    /* 64-bit signed index: the product cannot wrap around, and the
       comparison with the signed N is not turned into an unsigned one
       (which would let a negative N pass the bounds check). */
    const long long index = (long long)blockIdx.x * blockDim.x + threadIdx.x;
    if (index < N)
    {
        const cuComplex z = in[index];
        out[index] = ComplexMul(z, z);
    }
}


/*
   Fortran-callable wrapper that squares an array of single-precision
   complex numbers on the GPU.

   Fortran subroutine arguments are passed by reference, so
   call cudafunction( array_a, array_b, N) is mapped to
   cudafunction_(*a, *b, *N); the trailing underscore matches the symbol
   name the Fortran compiler generates for the call.

   a  - host array of *Np input values (read only)
   b  - host array of *Np elements that receives a[i]*a[i]
   Np - pointer to the element count

   If a pointer is NULL or the count is not positive nothing is done.
   If any CUDA call fails, a message is written to stderr and b is left
   as the caller passed it.
*/

/* Report a failed CUDA runtime call on stderr.
   Returns nonzero when `status` is an error, zero on success. */
static int cudafunction_failed(cudaError_t status, const char *what)
{
  if (status == cudaSuccess) return 0;
  fprintf(stderr, "cudafunction: %s failed: %s\n",
          what, cudaGetErrorString(status));
  return 1;
}

extern "C" void cudafunction_(cuComplex *a, cuComplex *b,  int *Np)
{
  /* A multiple of the 32-thread warp size, so no lanes are wasted and
     fewer blocks are needed for a given N. */
  const int block_size = 256;
  cuComplex *a_d = NULL;

  if (a == NULL || b == NULL || Np == NULL) return;

  const int N = *Np;
  /* Nothing to compute; also avoids launching a grid with zero blocks. */
  if (N <= 0) return;

  const size_t bytes = sizeof(cuComplex) * (size_t)N;

  /* Allocate complex array on device */
  if (cudafunction_failed(cudaMalloc((void **) &a_d, bytes), "cudaMalloc"))
    return;

  /* Copy array from host memory to device memory */
  if (!cudafunction_failed(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                           "cudaMemcpy (host to device)"))
  {
    /* Compute execution configuration: ceil(N / block_size) blocks,
       written so the intermediate value cannot overflow for large N. */
    dim3 dimBlock(block_size);
    dim3 dimGrid((unsigned int)((N - 1) / block_size + 1));

    /* Execute the kernel in place on the device buffer */
    square_complex<<<dimGrid,dimBlock>>>(a_d, a_d, N);

    /* A kernel launch returns no status; an invalid launch
       configuration is reported through cudaGetLastError(). */
    if (!cudafunction_failed(cudaGetLastError(), "square_complex launch"))
    {
      /* Copy the result back. This blocking copy waits for the kernel,
         so errors raised while the kernel runs are reported here. */
      cudafunction_failed(cudaMemcpy(b, a_d, bytes, cudaMemcpyDeviceToHost),
                          "cudaMemcpy (device to host)");
    }
  }

  /* Free memory on the device (also on the error paths above) */
  cudafunction_failed(cudaFree(a_d), "cudaFree");

  return;
}






saludos,
:)

Joel Rodríguez

Responder a