/* NOTE(review): original post was marked "ТРЕБУЕТСЯ ПРОВЕРКА!!!"
 * ("VERIFICATION REQUIRED"). Reviewed and fixed below:
 *  - both #include lines had lost their header names (code could not compile);
 *  - kernel had no bounds guard (correct only because 2*3 == 6 exactly);
 *  - no error checking on any CUDA call or on the kernel launch.
 */

/* ---------------------------- kernel.cu ---------------------------- */
#include <stdio.h>

#define N 6

/* Abort with a message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            return;                                                        \
        }                                                                  \
    } while (0)

/* Element-wise vector add: array3[i] = array1[i] + array2[i].
 * Expects a 1-D grid; the guard makes any launch with >= n threads safe. */
__global__ void kernel(const int *array1, const int *array2, int *array3,
                       int n)
{
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < n)
        array3[index] = array1[index] + array2[index];
}

/* Host wrapper, callable from C (the MPI driver below).
 * Fills two N-element arrays, adds them on the GPU, prints the result.
 * array1[i] = i and array2[i] = 3 - i, so every sum is 3. */
extern "C" void run_kernel(void)
{
    int i, array1[N], array2[N], array3[N];
    int *devarray1, *devarray2, *devarray3;

    for (i = 0; i < N; i++) {
        array1[i] = i;
        array2[i] = 3 - i;
    }

    CUDA_CHECK(cudaMalloc((void **)&devarray1, sizeof(int) * N));
    CUDA_CHECK(cudaMalloc((void **)&devarray2, sizeof(int) * N));
    CUDA_CHECK(cudaMalloc((void **)&devarray3, sizeof(int) * N));

    CUDA_CHECK(cudaMemcpy(devarray1, array1, sizeof(int) * N,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(devarray2, array2, sizeof(int) * N,
                          cudaMemcpyHostToDevice));

    kernel<<<2, 3>>>(devarray1, devarray2, devarray3, N);
    CUDA_CHECK(cudaGetLastError());       /* catch bad launch config   */
    CUDA_CHECK(cudaDeviceSynchronize());  /* catch in-kernel failures  */

    /* blocking copy: results are valid on the host afterwards */
    CUDA_CHECK(cudaMemcpy(array3, devarray3, sizeof(int) * N,
                          cudaMemcpyDeviceToHost));

    for (i = 0; i < N; i++)
        printf("%d ", array3[i]);
    printf("\n");

    cudaFree(devarray1);
    cudaFree(devarray2);
    cudaFree(devarray3);
}

/* ------------------------------ mpi.c ------------------------------ */
#include <mpi.h>
#include <stdio.h>

void run_kernel(void);  /* implemented in kernel.cu (extern "C") */

int main(int argc, char *argv[])
{
    int rank, size;

    MPI_Init(&argc, &argv);                 /* starts MPI                 */
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);   /* get current process id     */
    MPI_Comm_size(MPI_COMM_WORLD, &size);   /* get number of processes    */

    /* Every rank runs the same GPU vector add independently;
     * rank/size are queried but not otherwise used here. */
    run_kernel();

    MPI_Finalize();
    return 0;
}

/* ----------------------------- build/run ---------------------------
 * nvcc -c kernel.cu
 * mpicc -o mpicuda mpi.c kernel.o -lcudart \
 *       -L /usr/local/cuda/lib -I /usr/local/cuda/include
 * mpirun -l -np 10 ./mpicuda
 *
 * Expected output (rank-prefixed by -l; rank order is nondeterministic),
 * one line per rank, e.g.:
 *   1: 3 3 3 3 3 3
 *   9: 3 3 3 3 3 3
 *   ...
 *   3: 3 3 3 3 3 3
 * Every element is 3 because array1[i] + array2[i] == i + (3 - i).
 * ------------------------------------------------------------------ */