ТРЕБУЕТСЯ ПРОВЕРКА!!!

kernel.cu

#include <stdio.h>
 
/*
 * Element-wise integer addition: array3[i] = array1[i] + array2[i].
 *
 * Each thread processes the single element at its flat global index,
 * computed from the x dimension of the launch only.
 *
 * There is no bounds check: every launched thread reads and writes, so the
 * launch must not supply more threads (gridDim.x * blockDim.x) than there
 * are valid elements in each of the three arrays.
 *
 * array1, array2 - input arrays (read).
 * array3         - output array (written).
 */
__global__ void kernel(int *array1, int *array2, int *array3)
{
        /* Flat global thread index for a 1-D launch. */
        const int gid = threadIdx.x + blockDim.x * blockIdx.x;

        const int lhs = array1[gid];
        const int rhs = array2[gid];

        array3[gid] = lhs + rhs;
}
 
/*
 * Checks the status of a CUDA runtime call.
 *
 * err  - status returned by the call (or by cudaGetLastError()).
 * what - short label naming the operation, used in the diagnostic.
 *
 * Returns 1 when err is cudaSuccess; otherwise writes a message to stderr
 * and returns 0.
 */
static int cuda_ok(cudaError_t err, const char *what)
{
        if (err == cudaSuccess)
                return 1;

        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
        return 0;
}

/*
 * Host-side driver for the vector-add demo, exported with C linkage.
 *
 * Fills two N-element host arrays (array1[i] = i, array2[i] = 3 - i), adds
 * them on the device with kernel(), and prints the N sums followed by a
 * newline to stdout.
 *
 * Every CUDA runtime call and the kernel launch are checked. After the first
 * failure the remaining steps are skipped, a diagnostic goes to stderr,
 * nothing is printed to stdout, and the device buffers are still released.
 */
extern "C"
void run_kernel()
{
        /*
         * kernel() has no bounds check, so the launch must cover exactly N
         * elements: BLOCKS * THREADS_PER_BLOCK == N.
         */
        const int N = 6;
        const int THREADS_PER_BLOCK = 3;
        const int BLOCKS = N / THREADS_PER_BLOCK;
        const size_t bytes = sizeof(int) * N;

        int i, ok, array1[N], array2[N], array3[N];
        /* NULL-initialized so the cudaFree calls below are safe on any path. */
        int *devarray1 = NULL, *devarray2 = NULL, *devarray3 = NULL;

        for(i = 0; i < N; i++)
        {
                array1[i] = i;
                array2[i] = 3-i;
        }

        /* Allocate device buffers and upload the inputs; && stops at the first failure. */
        ok = cuda_ok(cudaMalloc((void**) &devarray1, bytes), "cudaMalloc(devarray1)")
          && cuda_ok(cudaMalloc((void**) &devarray2, bytes), "cudaMalloc(devarray2)")
          && cuda_ok(cudaMalloc((void**) &devarray3, bytes), "cudaMalloc(devarray3)")
          && cuda_ok(cudaMemcpy(devarray1, array1, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(array1 -> devarray1)")
          && cuda_ok(cudaMemcpy(devarray2, array2, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(array2 -> devarray2)");

        if (ok)
        {
                kernel<<<BLOCKS, THREADS_PER_BLOCK>>>(devarray1, devarray2, devarray3);

                /*
                 * A launch returns no status itself: cudaGetLastError() reports
                 * launch failures, and the blocking copy back to the host
                 * reports errors raised while the kernel was executing.
                 */
                ok = cuda_ok(cudaGetLastError(), "kernel launch")
                  && cuda_ok(cudaMemcpy(array3, devarray3, bytes, cudaMemcpyDeviceToHost),
                             "cudaMemcpy(devarray3 -> array3)");
        }

        /* Print only when array3 actually holds results from the device. */
        if (ok)
        {
                for(i = 0; i < N; i++)
                {
                        printf("%d ", array3[i]);
                }
                printf("\n");
        }

        /* cudaFree performs no operation on a null pointer. */
        cudaFree(devarray1);
        cudaFree(devarray2);
        cudaFree(devarray3);
}

mpi.c

#include <mpi.h>
 
void run_kernel();
 
/*
 * Entry point of the MPI host program.
 *
 * Starts MPI, queries this process's rank and the number of processes in
 * MPI_COMM_WORLD (neither value is used afterwards), calls run_kernel()
 * once in every process, then shuts MPI down.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
        int my_rank;
        int world_size;

        /* Start the MPI runtime. */
        MPI_Init(&argc, &argv);

        /* Id of the current process within MPI_COMM_WORLD. */
        MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);

        /* Number of processes in MPI_COMM_WORLD. */
        MPI_Comm_size(MPI_COMM_WORLD, &world_size);

        run_kernel();

        MPI_Finalize();

        return 0;
}

shell

nvcc -c kernel.cu
mpicc -o mpicuda mpi.c kernel.o -lcudart -L /usr/local/cuda/lib -I /usr/local/cuda/include
 
 
mpirun -l -np 10 ./mpicuda
1: 3 3 3 3 3 3
9: 3 3 3 3 3 3
8: 3 3 3 3 3 3
2: 3 3 3 3 3 3
7: 3 3 3 3 3 3
6: 3 3 3 3 3 3
0: 3 3 3 3 3 3
4: 3 3 3 3 3 3
5: 3 3 3 3 3 3
3: 3 3 3 3 3 3