#include <stdio.h>
#include <math.h>
#include <unistd.h>
#include <mpi.h>

#define THREADS    512
#define MAX_BLOCKS  64

// GPU kernel; by construction THREADS == blockDim.x
__global__ void integrate(int *n, int *blocks, int *myid, int *numprocs,
                          double *gsum)
{
    const unsigned int bid = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int gid = *blocks * blockDim.x * *myid + bid;
    const unsigned int tid = threadIdx.x;
    double h, x, sum;
    int i;
    __shared__ double ssum[THREADS];

    sum = 0.0;
    h = 1.0 / (double) *n;
    // each thread sums every (blockDim.x * blocks * numprocs)-th interval
    for (i = gid + 1; i <= *n; i += blockDim.x * *blocks * *numprocs) {
        x = h * ((double) i - 0.5);
        sum += 4.0 / (1.0 + x * x);
    }
    ssum[tid] = sum * h;

    // block reduction: tree reduction in shared memory
    __syncthreads();
    for (i = blockDim.x / 2; i > 0; i >>= 1) {  /* per block */
        if (tid < i)
            ssum[tid] += ssum[tid + i];
        __syncthreads();
    }

    gsum[bid] = ssum[tid];
}

// number of threads (= number of blocks of the first kernel) must be a power of 2
__global__ static void global_reduce(int *n, int *blocks, double *gsum)
{
    __shared__ double ssum[THREADS];
    const unsigned int tid = threadIdx.x;
    unsigned int i;

    // gather each block's result (element 0 of that block) into shared memory
    if (tid < *blocks)
        ssum[tid] = gsum[tid * THREADS];
    else
        ssum[tid] = 0.0;
    __syncthreads();

    for (i = blockDim.x / 2; i > 0; i >>= 1) {  /* per block */
        if (tid < i)
            ssum[tid] += ssum[tid + i];
        __syncthreads();
    }

    gsum[tid] = ssum[tid];
}

int main(int argc, char *argv[])
{
    int n, blocks, myid, numprocs;
    int *n_d, *blocks_d, *myid_d, *numprocs_d;   // device copies
    double PI25DT = 3.141592653589793238462643;
    double mypi, pi;
    double *mypi_d;                              // device array of partial sums
    double startwtime, endwtime;
    int namelen;
    char processor_name[MPI_MAX_PROCESSOR_NAME];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Get_processor_name(processor_name, &namelen);

    // Allocate memory on the GPU
    cudaMalloc((void **) &n_d,        sizeof(int) * 1);
    cudaMalloc((void **) &blocks_d,   sizeof(int) * 1);
    cudaMalloc((void **) &numprocs_d, sizeof(int) * 1);
    cudaMalloc((void **) &myid_d,     sizeof(int) * 1);
    cudaMalloc((void **) &mypi_d,     sizeof(double) * THREADS * MAX_BLOCKS);

    printf("MPI Task %2d on %20s\n", myid, processor_name);
    sleep(1);  // wait for everyone to print

    while (1) {
        if (myid == 0) {
            printf("Enter the number of intervals: (0 quits) ");
            fflush(stdout);
            scanf("%d", &n);
            printf("Enter the number of blocks: (<=%d) ", MAX_BLOCKS);
            fflush(stdout);
            scanf("%d", &blocks);
            startwtime = MPI_Wtime();
        }
        MPI_Barrier(MPI_COMM_WORLD);
        MPI_Bcast(&n,      1, MPI_INT, 0, MPI_COMM_WORLD);
        MPI_Bcast(&blocks, 1, MPI_INT, 0, MPI_COMM_WORLD);
        if (n == 0 || blocks > MAX_BLOCKS)
            break;

        // copy from CPU to GPU
        cudaMemcpy(n_d,        &n,        sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(blocks_d,   &blocks,   sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(numprocs_d, &numprocs, sizeof(int) * 1, cudaMemcpyHostToDevice);
        cudaMemcpy(myid_d,     &myid,     sizeof(int) * 1, cudaMemcpyHostToDevice);

        integrate<<< blocks, THREADS >>>(n_d, blocks_d, myid_d, numprocs_d, mypi_d);
        if (blocks > 1)
            global_reduce<<< 1, blocks >>>(n_d, blocks_d, mypi_d);

        // copy back from GPU to CPU
        cudaMemcpy(&mypi, mypi_d, sizeof(double) * 1, cudaMemcpyDeviceToHost);

        MPI_Reduce(&mypi, &pi, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

        if (myid == 0) {
            endwtime = MPI_Wtime();
            printf("pi is approximately %.16f, Error is %.16f\n",
                   pi, fabs(pi - PI25DT));
            printf("wall clock time = %.6f\n", endwtime - startwtime);
        }
    }

    // free GPU memory
    cudaFree(n_d);
    cudaFree(blocks_d);
    cudaFree(myid_d);
    cudaFree(numprocs_d);
    cudaFree(mypi_d);

    MPI_Finalize();
    return 0;
}
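/*
 * Build/run sketch (assumptions, not part of the listing above): this presumes
 * an MPI installation providing mpicc/mpirun and the CUDA toolkit's nvcc on a
 * GPU-equipped node.  The file name pi_mpi_cuda.cu and the MPI_HOME/CUDA_HOME
 * variables are placeholders; exact paths and wrappers vary by installation.
 *
 *   nvcc -c pi_mpi_cuda.cu -o pi_mpi_cuda.o -I${MPI_HOME}/include
 *   mpicc pi_mpi_cuda.o -o pi_mpi_cuda -L${CUDA_HOME}/lib64 -lcudart
 *   mpirun -np 4 ./pi_mpi_cuda
 *
 * Each MPI rank launches its own kernels, so this run layout assumes one GPU
 * (or one GPU share) available per rank.
 */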