**NEEDS REVIEW!!!**
kernel.cu
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/*
 * Element-wise vector add: array3[i] = array1[i] + array2[i].
 *
 * Precondition: launched as a 1-D grid whose total thread count exactly
 * equals the array length (here 2 blocks x 3 threads = 6 elements).
 * There is no bounds guard, so the grid must not exceed the arrays' extent.
 */
__global__ void kernel(int *array1, int *array2, int *array3)
{
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    array3[tid] = array1[tid] + array2[tid];
}
/*
 * Report-and-abort wrapper for CUDA runtime calls.  The original code
 * silently ignored every cudaError_t, so any failure (no device, bad
 * copy, bad launch) produced garbage output with no diagnostic.
 */
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
        exit(EXIT_FAILURE);
    }
}

/*
 * Builds two N-element host vectors (array1[i] = i, array2[i] = 3 - i),
 * adds them on the GPU, and prints the result — every element is
 * i + (3 - i) = 3, matching the captured program output.
 * Declared with C linkage so the MPI C driver (mpi.c) can call it.
 */
extern "C"
void run_kernel()
{
    enum { N = 6, THREADS_PER_BLOCK = 3 };   /* was hard-coded 6 and 2<<<2,3>>> inline */
    const size_t bytes = sizeof(int) * N;
    int i, array1[N], array2[N], array3[N];
    int *devarray1, *devarray2, *devarray3;

    for (i = 0; i < N; i++) {
        array1[i] = i;
        array2[i] = 3 - i;
    }

    checkCuda(cudaMalloc((void **)&devarray1, bytes), "cudaMalloc devarray1");
    checkCuda(cudaMalloc((void **)&devarray2, bytes), "cudaMalloc devarray2");
    checkCuda(cudaMalloc((void **)&devarray3, bytes), "cudaMalloc devarray3");

    checkCuda(cudaMemcpy(devarray1, array1, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy array1 H2D");
    checkCuda(cudaMemcpy(devarray2, array2, bytes, cudaMemcpyHostToDevice),
              "cudaMemcpy array2 H2D");

    /* N must equal blocks * threads: the kernel has no bounds guard. */
    kernel<<<N / THREADS_PER_BLOCK, THREADS_PER_BLOCK>>>(devarray1, devarray2,
                                                         devarray3);
    /* Launch-configuration errors never reach a return value; poll for them. */
    checkCuda(cudaGetLastError(), "kernel launch");

    /* Blocking cudaMemcpy synchronizes with the kernel before copying back. */
    checkCuda(cudaMemcpy(array3, devarray3, bytes, cudaMemcpyDeviceToHost),
              "cudaMemcpy array3 D2H");

    for (i = 0; i < N; i++)
        printf("%d ", array3[i]);
    printf("\n");

    checkCuda(cudaFree(devarray1), "cudaFree devarray1");
    checkCuda(cudaFree(devarray2), "cudaFree devarray2");
    checkCuda(cudaFree(devarray3), "cudaFree devarray3");
}
mpi.c
#include <mpi.h>
void run_kernel();
/*
 * MPI driver: every rank initializes MPI, queries its rank and the world
 * size, then runs the CUDA vector-add demo.  The rank/size values are
 * queried but otherwise unused here (mpirun's -l flag prefixes each
 * rank's output with its id, as seen in the captured run).
 */
int main(int argc, char *argv[])
{
    int world_rank = 0;
    int world_size = 0;

    MPI_Init(&argc, &argv);                      /* start the MPI runtime */
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);  /* this process's id */
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);  /* total process count */

    run_kernel();                                /* GPU add + print on this rank */

    MPI_Finalize();
    return 0;
}
shell
nvcc -c kernel.cu
mpicc -o mpicuda mpi.c kernel.o -lcudart -L /usr/local/cuda/lib -I /usr/local/cuda/include
mpirun -l -np 10 ./mpicuda
1: 3 3 3 3 3 3
9: 3 3 3 3 3 3
8: 3 3 3 3 3 3
2: 3 3 3 3 3 3
7: 3 3 3 3 3 3
6: 3 3 3 3 3 3
0: 3 3 3 3 3 3
4: 3 3 3 3 3 3
5: 3 3 3 3 3 3
3: 3 3 3 3 3 3