// NOTE(review): the names of the two system headers were missing from the
// reviewed source ("#include #include"); <stdio.h> and <math.h> are the ones
// the code below needs (printf/scanf/fflush and fabs) -- confirm against the
// original file.
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include "mytime.h"

#define THREADS 512
#define MAX_BLOCKS 64

// Abort with a readable message when a CUDA runtime call fails. Without this,
// a failed allocation, copy or launch would go unnoticed and the program
// would print a result computed from stale host memory.
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_err_ = (call);                                      \
        if (cuda_err_ != cudaSuccess) {                                      \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(cuda_err_));                          \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// GPU kernel: per-thread partial sums of the midpoint rule for
//   integral over [0,1] of 4 / (1 + x^2) dx  (= pi).
//
// Parameters (all device pointers):
//   n      - number of intervals.
//   blocks - number of blocks used for the loop stride; the host launches the
//            kernel with exactly this many blocks.
//   gsum   - output, one entry per launched thread, i.e. at least
//            gridDim.x * blockDim.x doubles. Every launched thread writes its
//            slot, even if it processed no interval (it then writes 0.0).
//
// Thread t (global index) handles intervals t+1, t+1+stride, ... <= n with
// stride = blockDim.x * *blocks, and stores (partial sum) * h in gsum[t].
__global__ void integrate(int *n, int *blocks, double *gsum)
{
    const int intervals = *n;
    const double h = 1.0 / (double) intervals;

    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // 64-bit index and stride: with an int index, "i += stride" wraps to a
    // negative value when n is close to INT_MAX and the loop would keep
    // running with bogus indices.
    const long long stride = (long long) blockDim.x * (long long) *blocks;

    // Each thread only ever touches its own accumulator, so a register is
    // sufficient; no shared memory is required.
    double acc = 0.0;
    for (long long i = (long long) tid + 1; i <= (long long) intervals; i += stride) {
        const double x = h * ((double) i - 0.5);   // midpoint of interval i
        acc += 4.0 / (1.0 + x * x);
    }

    gsum[tid] = acc * h;
}

// Interactive driver: repeatedly reads the number of intervals and the number
// of blocks from stdin, runs the kernel, sums the per-thread partial results
// on the host and prints the approximation, its error and the elapsed
// wall-clock time. The loop ends on n <= 0, on an out-of-range block count,
// or when stdin is exhausted / contains non-numeric input.
int main(int argc, char *argv[])
{
    int n, i, blocks;
    int *n_d, *blocks_d;                        // device copies of n, blocks
    const double PI25DT = 3.141592653589793238462643;
    double pi;
    static double mypi[THREADS * MAX_BLOCKS];   // 256 KB: keep it off the stack
    double *mypi_d;                             // device copy of the partial sums
    struct timeval startwtime, endwtime, diffwtime;

    (void) argc;
    (void) argv;

    // Allocate memory on GPU
    CUDA_CHECK(cudaMalloc((void **) &n_d, sizeof(int) * 1));
    CUDA_CHECK(cudaMalloc((void **) &blocks_d, sizeof(int) * 1));
    CUDA_CHECK(cudaMalloc((void **) &mypi_d, sizeof(double) * THREADS * MAX_BLOCKS));

    while (1) {
        printf("Enter the number of intervals: (0 quits) ");
        fflush(stdout);
        // Stop on EOF or malformed input instead of looping on stale values.
        if (scanf("%d", &n) != 1)
            break;

        printf("Enter the number of blocks: (<=%d) ", MAX_BLOCKS);
        fflush(stdout);
        if (scanf("%d", &blocks) != 1)
            break;

        // A non-positive interval count ends the session.
        if (n <= 0)
            break;

        // A block count outside [1, MAX_BLOCKS] cannot be launched and would
        // overrun (or under-size) the result buffers.
        if (blocks < 1 || blocks > MAX_BLOCKS) {
            fprintf(stderr, "number of blocks must be between 1 and %d\n", MAX_BLOCKS);
            break;
        }

        gettimeofday(&startwtime, NULL);

        // copy from CPU to GPU
        CUDA_CHECK(cudaMemcpy(n_d, &n, sizeof(int) * 1, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(blocks_d, &blocks, sizeof(int) * 1, cudaMemcpyHostToDevice));

        integrate<<< blocks, THREADS >>>(n_d, blocks_d, mypi_d);
        CUDA_CHECK(cudaGetLastError());         // catches launch-configuration errors

        // copy back from GPU to CPU; this blocking copy also waits for the
        // kernel to finish and reports any execution error.
        CUDA_CHECK(cudaMemcpy(mypi, mypi_d, sizeof(double) * THREADS * blocks,
                              cudaMemcpyDeviceToHost));

        // Final reduction of the per-thread partial sums on the host.
        pi = 0.0;
        for (i = 0; i < THREADS * blocks; i++)
            pi += mypi[i];

        gettimeofday(&endwtime, NULL);
        MINUS_UTIME(diffwtime, endwtime, startwtime);

        printf("pi is approximately %.16f, Error is %.16f\n",
               pi, fabs(pi - PI25DT));
        // tv_sec / tv_usec are not int; cast explicitly so the format matches.
        printf("wall clock time = %ld.%06ld\n",
               (long) diffwtime.tv_sec, (long) diffwtime.tv_usec);
    }

    // free GPU memory
    CUDA_CHECK(cudaFree(n_d));
    CUDA_CHECK(cudaFree(blocks_d));
    CUDA_CHECK(cudaFree(mypi_d));

    return 0;
}