UNIX


fortest.f
 
PROGRAM fortest

! Driver: fills two length-N vectors on the host, prints the first,
! hands both to the external routine kernel_wrapper (which adds them
! in place into a), then prints the updated first vector.

IMPLICIT NONE

integer*4, parameter :: N=8
integer*4 :: i
real*4, Dimension(N) :: a, b

! a = 1.0, 2.0, ..., N ; every element of b is 2.0
a = (/ (real(i), i=1,N) /)
b = 2.0

print *, 'a = ', a

! N is passed by reference, like every Fortran actual argument
CALL kernel_wrapper(a, b, N)

print *, 'a + 2 = ', a

END PROGRAM fortest
cudatest.cu
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cuda_runtime.h>
 
 
// Element-wise, in-place vector add: a[i] = a[i] + b[i] for every i < N.
//
// Launch layout: 1-D grid of 1-D blocks; each thread handles at most one
// element, selected by its global thread index. Threads whose index is
// >= N do nothing, so the total thread count may exceed N.
//
//   a  in/out vector of at least N floats (dereferenced on the device)
//   b  input vector of at least N floats (dereferenced on the device)
//   N  number of elements to process
__global__ void vect_add(float *a, float *b, int N)
{
   // Global index: equals threadIdx.x for a single-block launch, and stays
   // correct if the caller ever launches more than one block.
   int idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (idx < N) a[idx] = a[idx] + b[idx];
}
 
// Reports a failed CUDA runtime call on stderr.
// Returns 1 when err is a failure, 0 when it is cudaSuccess.
static int cuda_failed(cudaError_t err, const char *what)
{
   if (err == cudaSuccess) return 0;
   fprintf(stderr, "kernel_wrapper: %s failed: %s\n",
           what, cudaGetErrorString(err));
   return 1;
}

// Function called from the main Fortran program as kernel_wrapper(a, b, N).
// The trailing underscore and the pointer-to-int length follow the Fortran
// calling convention used by the caller (lower-case name + '_', all
// arguments passed by reference).
//
// Computes a[i] = a[i] + b[i] for i in [0, N) on the GPU.
//
//   a   in/out host vector of *Np floats; overwritten with the sum
//   b   input host vector of *Np floats; left untouched
//   Np  pointer to the element count N
//
// On any CUDA failure a message is written to stderr, device memory is
// released, and a is left as it was on entry. Nothing is done when an
// argument is NULL or N <= 0.
//
// NOTE: the kernel is launched as one block of N threads, so N must not
// exceed the device's maximum threads per block; a larger N is reported as
// a launch failure rather than silently producing no result.
extern "C" void kernel_wrapper_(float *a, float *b, int *Np)
{
   float  *a_d = NULL, *b_d = NULL;  // GPU vector copies (NULL until allocated)
   int    blocks = 1;                // uses 1 block of N threads on GPU
   int    N;
   size_t bytes;

   if (a == NULL || b == NULL || Np == NULL) return;
   N = *Np;
   if (N <= 0) return;               // nothing to add; avoids a 0-thread launch
   bytes = sizeof(float) * (size_t)N;

   // Single-pass block: 'break' jumps to the common cleanup below.
   do {
      // Allocate memory on GPU
      if (cuda_failed(cudaMalloc((void **)&a_d, bytes), "cudaMalloc(a)")) break;
      if (cuda_failed(cudaMalloc((void **)&b_d, bytes), "cudaMalloc(b)")) break;

      // copy vectors from CPU to GPU
      if (cuda_failed(cudaMemcpy(a_d, a, bytes, cudaMemcpyHostToDevice),
                      "copy of a to device")) break;
      if (cuda_failed(cudaMemcpy(b_d, b, bytes, cudaMemcpyHostToDevice),
                      "copy of b to device")) break;

      // call function on GPU; launch errors (bad configuration) are only
      // visible through cudaGetLastError()
      vect_add<<< blocks, N >>>(a_d, b_d, N);
      if (cuda_failed(cudaGetLastError(), "vect_add launch")) break;

      // Copy the result back. This blocking copy also waits for the kernel
      // and reports any error raised while it ran. b is never written by
      // the kernel, so it is not copied back.
      if (cuda_failed(cudaMemcpy(a, a_d, bytes, cudaMemcpyDeviceToHost),
                      "copy of a to host")) break;
   } while (0);

   // free GPU memory (cudaFree(NULL) is a no-op, so this is safe on every path)
   cudaFree(a_d);
   cudaFree(b_d);
}

Makefile

# Builds the Fortran driver (fortest.f) linked against the CUDA wrapper
# object (cudatest.o). The executable is written to a.out.
# NOTE: every recipe line below must start with a TAB character.

.PHONY: clean

# Libraries are listed after the objects that use them so the linker can
# resolve their symbols; the long command is continued with a backslash.
Test: fortest.f cudatest.o
	gfortran -I /usr/local/cuda/include fortest.f cudatest.o \
	         -L /usr/local/cuda/lib -lcudart -lcuda

cudatest.o: cudatest.cu
	nvcc -c -O3 cudatest.cu

# -f: do not fail when some of the files are already gone
clean:
	rm -f a.out cudatest.o cudatest.linkinfo