**UNIX**
----
fortest.f
! fortest: fills two length-N vectors on the host, prints a, calls the
! CUDA wrapper to add b into a, then prints a again.
! Statements start in column 7 because gfortran treats a .f file as
! fixed-form source, where columns 1-6 are reserved.
      PROGRAM fortest
      IMPLICIT NONE
      integer*4 :: i
      integer*4, parameter :: N=8
! 4-byte reals/integers to pair with the float* / int* parameters of
! kernel_wrapper_ in cudatest.cu.
      real*4, Dimension(N) :: a, b
! Resolved at link time to the C symbol kernel_wrapper_ (gfortran
! appends the underscore to external names by default).
      EXTERNAL kernel_wrapper
! a = 1, 2, ..., N and b = 2 everywhere
      DO i=1,N
         a(i)=i*1.0
         b(i)=2.0
      END DO
      print *, 'a = ', (a(i), i=1,N)
! Arguments are passed by reference; a is updated in place.
      CALL kernel_wrapper(a, b, N)
      print *, 'a + 2 = ', (a(i), i=1,N)
      END PROGRAM
cudatest.cu
// NOTE(review): the header names were lost from these #include lines;
// the list below is a reconstruction -- confirm against the original.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
// Element-wise in-place add: a[i] += b[i] for every i in [0, N).
// Launch layout: any 1-D grid of 1-D blocks. Each thread starts at its
// global index and advances by the total thread count (grid-stride loop),
// so all N elements are covered regardless of the launch configuration.
// No shared memory is used. a and b must be device pointers to >= N floats.
__global__ void vect_add(float *a, const float *b, int N)
{
    // 64-bit indices so that start + stride cannot overflow for large N.
    const long long count  = N;
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < count; idx += stride)
        a[idx] += b[idx];
}

// Reports a failed CUDA runtime call on stderr; returns true on success.
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) return true;
    fprintf(stderr, "kernel_wrapper_: %s failed: %s\n",
            what, cudaGetErrorString(status));
    return false;
}

// Function called from the main Fortran program as `kernel_wrapper`.
// The trailing underscore and the pointer parameters follow the Fortran
// calling convention used by the caller: every argument arrives by reference.
//   a  : host vector of *Np floats, overwritten with a + b on success
//   b  : host vector of *Np floats, read only
//   Np : pointer to the element count; a count <= 0 is a no-op
// On any CUDA error a message is written to stderr, device memory is
// released, and the contents of a should not be relied upon.
extern "C" void kernel_wrapper_(float *a, float *b, int *Np)
{
    const int N = *Np;
    if (N <= 0) return;  // nothing to add; also avoids an invalid empty launch

    const size_t bytes   = sizeof(float) * (size_t)N;
    const int    threads = 256;
    // Ceil-division written so that it cannot overflow for N near INT_MAX.
    const int    blocks  = (N - 1) / threads + 1;

    float *a_d = NULL, *b_d = NULL;  // GPU copies of a and b

    // Allocate device memory and upload both vectors; stop at the first error.
    bool ok = cuda_ok(cudaMalloc((void **)&a_d, bytes), "cudaMalloc(a_d)")
           && cuda_ok(cudaMalloc((void **)&b_d, bytes), "cudaMalloc(b_d)")
           && cuda_ok(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                      "copy of a to device")
           && cuda_ok(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice),
                      "copy of b to device");

    if (ok) {
        vect_add<<< blocks, threads >>>(a_d, b_d, N);
        // A launch does not return a status itself: query it explicitly.
        // The blocking copy that follows waits for the kernel and reports
        // any error raised while it ran. Only a is copied back because the
        // kernel never writes b.
        ok = cuda_ok(cudaGetLastError(), "vect_add launch")
          && cuda_ok(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost),
                     "copy of a to host");
    }

    // Release each buffer exactly once; cudaFree on a null pointer is a no-op,
    // so this is safe even when an allocation above failed.
    cudaFree(a_d);
    cudaFree(b_d);
    return;
}
Makefile
# Builds the Fortran driver together with the CUDA object file (output: a.out).
# Every recipe line below starts with a TAB character, as make requires.
# The CUDA install prefix can be overridden: make CUDA_HOME=/opt/cuda
CUDA_HOME ?= /usr/local/cuda

# Neither target produces a file of its own name.
.PHONY: Test clean

# Objects are listed before the libraries that satisfy them so that linkers
# which resolve symbols left to right still find them. -lstdc++ is added
# because nvcc compiles the host side of cudatest.cu as C++.
Test: fortest.f cudatest.o
	gfortran -I $(CUDA_HOME)/include fortest.f cudatest.o \
		-L $(CUDA_HOME)/lib -lcudart -lcuda -lstdc++

cudatest.o: cudatest.cu
	nvcc -c -O3 cudatest.cu

# -f: do not fail when some of the files are already absent.
clean:
	rm -f a.out cudatest.o cudatest.linkinfo