#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <mpi.h>

#define THREADS    512
#define MAX_BLOCKS  64

// GPU kernel; by construction THREADS == blockDim.x
__global__ void integrate(int *n, int *blocks, int *myid, int *numprocs,
                          double *gsum)
{
    const unsigned int bid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int gid = *blocks * blockDim.x * *myid + bid;
    const unsigned int tid = threadIdx.x;
    double h, x, sum;
    int i;
    __shared__ double ssum[THREADS];

    sum = 0.0;
    h = 1.0 / (double) *n;
    // each thread sums every (blockDim.x * blocks * numprocs)-th interval
    for (i = gid + 1; i <= *n; i += blockDim.x * *blocks * *numprocs) {
        x = h * ((double) i - 0.5);
        sum += 4.0 / (1.0 + x * x);
    }
    ssum[tid] = sum * h;

    // block reduction: tree reduction in shared memory
    __syncthreads();
    for (i = blockDim.x / 2; i > 0; i >>= 1) {  /* per block */
        if (tid < i)
            ssum[tid] += ssum[tid + i];
        __syncthreads();
    }

    gsum[bid] = ssum[tid];
}

// number of threads (= number of blocks of the first kernel) must be a power of 2
__global__ static void global_reduce(int *n, int *blocks, double *gsum)
{
    __shared__ double ssum[THREADS];
    const unsigned int tid = threadIdx.x;
    unsigned int i;

    // gather each block's result (element 0 of that block) into shared memory
    if (tid < *blocks)
        ssum[tid] = gsum[tid * THREADS];
    else
        ssum[tid] = 0.0;
    __syncthreads();

    for (i = blockDim.x / 2; i > 0; i >>= 1) {  /* per block */
        if (tid < i)
            ssum[tid] += ssum[tid + i];
        __syncthreads();
    }

    gsum[tid] = ssum[tid];
}

int main(int argc, char *argv[])
{
    int n, blocks, myid, numprocs;
    int *n_d, *blocks_d, *myid_d, *numprocs_d;   // device copies
    double PI25DT = 3.141592653589793238462643;
    double mypi, pi;
    double *mypi_d;                              // device array of partial sums
    double startwtime, endwtime;
    int namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(processor_name, &namelen);

    // Allocate memory on the GPU
    cudaMalloc((void **) &n_d,        sizeof(int) * 1);
    cudaMalloc((void **) &blocks_d,   sizeof(int) * 1);
    cudaMalloc((void **) &numprocs_d, sizeof(int) * 1);
    cudaMalloc((void **) &myid_d,     sizeof(int) * 1);
    cudaMalloc((void **) &mypi_d,     sizeof(double) * THREADS * MAX_BLOCKS);

    printf("MPI Task %2d on %20s\n", myid, processor_name);
    sleep(1);  // wait for everyone to print

    while (1) {
        if (myid == 0) {
            printf("Enter the number of intervals: (0 quits) ");
            fflush(stdout);
            scanf("%d", &n);
            printf("Enter the number of blocks: (<=%d) ", MAX_BLOCKS);
            fflush(stdout);
            scanf("%d", &blocks);
            startwtime = MPI_Wtime();
        }
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Bcast(&n,      1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&blocks, 1, MPI_INT, 0, MPI_COMM_WORLD);
        if (n == 0 || blocks > MAX_BLOCKS)
            break;

        // copy from CPU to GPU
        cudaMemcpy(n_d,        &n,        sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(blocks_d,   &blocks,   sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(numprocs_d, &numprocs, sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(myid_d,     &myid,     sizeof(int) * 1, cudaMemcpyHostToDevice);

        integrate<<< blocks, THREADS >>>(n_d, blocks_d, myid_d, numprocs_d, mypi_d);
        if (blocks > 1)
            global_reduce<<< 1, blocks >>>(n_d, blocks_d, mypi_d);

        // copy back from GPU to CPU
        cudaMemcpy(&mypi, mypi_d, sizeof(double) * 1, cudaMemcpyDeviceToHost);

        MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

        if (myid == 0) {
            endwtime = MPI_Wtime();
            printf("pi is approximately %.16f, Error is %.16f\n",
                   pi, fabs(pi - PI25DT));
            printf("wall clock time = %.6f\n", endwtime - startwtime);
        }
    }

    // free GPU memory
    cudaFree(n_d);
    cudaFree(blocks_d);
    cudaFree(myid_d);
    cudaFree(numprocs_d);
    cudaFree(mypi_d);

    MPI_Finalize();
    return 0;
}
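/*
 * Build/run sketch (assumptions, not part of the listing above): this presumes
 * an MPI installation providing mpicc/mpirun and the CUDA toolkit's nvcc on a
 * GPU-equipped node.  The file name pi_mpi_cuda.cu and the MPI_HOME/CUDA_HOME
 * variables are placeholders; exact paths and wrappers vary by installation.
 *
 *   nvcc -c pi_mpi_cuda.cu -o pi_mpi_cuda.o -I${MPI_HOME}/include
 *   mpicc pi_mpi_cuda.o -o pi_mpi_cuda -L${CUDA_HOME}/lib64 -lcudart
 *   mpirun -np 4 ./pi_mpi_cuda
 *
 * Each MPI rank launches its own kernels, so this run layout assumes one GPU
 * (or one GPU share) available per rank.
 */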