**UNIX** ----

fortest.f

      PROGRAM fortest
      ! simple program which creates 2 vectors and adds them in a
      ! cuda function
      IMPLICIT NONE
      integer*4 :: i
      integer*4, parameter :: N=8
      real*4, Dimension(N) :: a, b

      DO i=1,N
        a(i)=i*1.0
        b(i)=2.0
      END DO

      print *, 'a = ', (a(i), i=1,N)
      ! kernel_wrapper is the extern "C" function kernel_wrapper_ in
      ! cudatest.cu; it overwrites a with a + b
      CALL kernel_wrapper(a, b, N)
      print *, 'a + 2 = ', (a(i), i=1,N)
      END PROGRAM

cudatest.cu

// NOTE: the header names of the original #include lines were lost when this
// listing was extracted; the headers below are the ones this file needs.
#include <stdio.h>
#include <cuda_runtime.h>

// Simple kernel that adds two vectors element-wise: a[i] = a[i] + b[i].
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Any grid with
// gridDim.x * blockDim.x >= N is valid (including a single block); threads
// whose index falls past the end of the vectors do nothing.
//   a : device pointer, N floats, updated in place
//   b : device pointer, N floats, read only
//   N : number of elements in a and b
__global__ void vect_add(float *a, float *b, int N)
{
    // global index, so the kernel also works when more than one block is used
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < N)
        a[idx] = a[idx] + b[idx];
}

// Reports a failed CUDA runtime call on stderr.
// Returns 1 when err is cudaSuccess, 0 otherwise.
static int cuda_ok(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return 1;
    fprintf(stderr, "kernel_wrapper: %s failed: %s\n",
            what, cudaGetErrorString(err));
    return 0;
}

// Function called from the main Fortran program as
//   CALL kernel_wrapper(a, b, N)
// The trailing underscore matches gfortran's default external-name mangling,
// and every argument arrives by reference, hence int *Np.
//
//   a  : host vector of *Np floats; on success overwritten with a + b
//   b  : host vector of *Np floats; not modified
//   Np : pointer to the element count
//
// If any CUDA call fails, a message is printed to stderr, the device buffers
// are released, and the function returns with a holding whatever had been
// copied back so far (normally: unchanged). Nothing is done when *Np <= 0.
extern "C" void kernel_wrapper_(float *a, float *b, int *Np)
{
    const int threads = 256;          // threads per block
    float *a_d = NULL, *b_d = NULL;   // GPU vector copies
    const int N = *Np;

    if (N <= 0)
        return;

    const size_t bytes = sizeof(float) * (size_t)N;
    // ceil(N / threads) without overflowing int for large N; a single block
    // of N threads would fail to launch once N exceeds the per-block limit
    const int blocks = (N - 1) / threads + 1;

    // Allocate memory on GPU and copy vectors from CPU to GPU; && stops at
    // the first failure so later steps never run on a bad buffer.
    int ok = cuda_ok(cudaMalloc((void **)&a_d, bytes), "cudaMalloc for a")
          && cuda_ok(cudaMalloc((void **)&b_d, bytes), "cudaMalloc for b")
          && cuda_ok(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                     "copy of a to device")
          && cuda_ok(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice),
                     "copy of b to device");

    if (ok) {
        // call function on GPU
        vect_add<<< blocks, threads >>>(a_d, b_d, N);

        // A launch does not return an error itself; query it explicitly.
        if (cuda_ok(cudaGetLastError(), "vect_add launch")) {
            // Copy the result back from GPU to CPU. The blocking cudaMemcpy
            // also waits for the kernel. b is never written by the kernel,
            // so it is not copied back.
            cuda_ok(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost),
                    "copy of a to host");
        }
    }

    // Free GPU memory: each buffer exactly once. cudaFree(NULL) is a no-op,
    // so this is safe even when an allocation above failed.
    cudaFree(a_d);
    cudaFree(b_d);
}

Makefile

# Libraries are listed after the objects that use them so that single-pass
# linkers resolve the CUDA symbols; -lstdc++ is there because nvcc compiles
# cudatest.cu as C++.
Test: fortest.f cudatest.o
	gfortran -I /usr/local/cuda/include fortest.f cudatest.o -L /usr/local/cuda/lib -lcudart -lcuda -lstdc++

cudatest.o: cudatest.cu
	nvcc -c -O3 cudatest.cu

# -f: do not fail when a file is already absent
clean:
	rm -f a.out cudatest.o cudatest.linkinfo