#include <stdio.h>
#include <math.h>
#include "mytime.h"

#define THREADS 512
#define MAX_BLOCKS 64

// GPU kernel: midpoint-rule integration of 4/(1+x^2) over [0,1], whose exact
// value is pi.
//
// Parameters (all device pointers):
//   n      - number of integration intervals.
//   blocks - number of blocks the intervals are strided over; must be positive.
//            main() passes the same value it uses as the grid size.
//   gsum   - output array with one slot per launched thread, i.e. at least
//            gridDim.x * blockDim.x doubles. Each thread stores its partial sum
//            (already scaled by the interval width); the caller adds them up.
//
// Each thread accumulates into a local variable, so the kernel places no
// requirement on blockDim.x and needs no shared memory.
__global__ void integrate(int *n, int *blocks, double *gsum) {
  // Read the parameters once instead of on every loop iteration.
  const int intervals = *n;
  // 64-bit index and stride: with an int index, i += stride can step past
  // INT_MAX before the "i <= n" test fails when n is close to INT_MAX.
  const long long stride = (long long) blockDim.x * (long long) *blocks;
  const long long first  = (long long) blockIdx.x * blockDim.x + threadIdx.x + 1;
  const unsigned int slot = blockIdx.x * blockDim.x + threadIdx.x;

  // A non-positive interval count has no work; write 0 rather than the
  // NaN / -0.0 that 0 * (1/n) would produce.
  if (intervals <= 0) {
    gsum[slot] = 0.0;
    return;
  }

  const double h = 1.0 / (double) intervals;
  double acc = 0.0;
  for (long long i = first; i <= intervals; i += stride) {
    const double x = h * ((double) i - 0.5); // midpoint of interval i (1-based)
    acc += 4.0 / (1.0 + x*x);
  }
  gsum[slot] = acc * h;
}

// Report a failed CUDA runtime call on stderr.
// Returns 1 when err is cudaSuccess, 0 otherwise.
static int cuda_ok(cudaError_t err, const char *what) {
  if (err == cudaSuccess)
    return 1;
  fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(err));
  return 0;
}

// Interactive driver: repeatedly reads an interval count and a block count,
// runs the integrate kernel with THREADS threads per block, sums the
// per-thread partial results on the host and prints the approximation of pi,
// its error and the elapsed wall-clock time.
//
// The loop ends on end-of-input or unparsable input, on a non-positive
// interval count, or on a block count outside 1..MAX_BLOCKS.
// Returns 0 on a normal exit and 1 if a CUDA call failed.
int main(int argc, char *argv[]) {
  int n, i, blocks;
  int status = 0;
  int *n_d = NULL, *blocks_d = NULL; // device copies of n and blocks
  double PI25DT = 3.141592653589793238462643;
  double pi;
  // static keeps the 256 KiB result buffer off the stack
  static double mypi[THREADS*MAX_BLOCKS];
  double *mypi_d = NULL; // device copy of the per-thread partial sums
  struct timeval startwtime, endwtime, diffwtime;

  (void) argc;
  (void) argv;

  // Allocate memory on GPU
  if (!cuda_ok(cudaMalloc( (void **) &n_d, sizeof(int) * 1 ), "cudaMalloc(n_d)") ||
      !cuda_ok(cudaMalloc( (void **) &blocks_d, sizeof(int) * 1 ), "cudaMalloc(blocks_d)") ||
      !cuda_ok(cudaMalloc( (void **) &mypi_d, sizeof(double) * THREADS * MAX_BLOCKS ),
               "cudaMalloc(mypi_d)"))
    status = 1;

  while (status == 0) {
    printf("Enter the number of intervals: (0 quits) ");fflush(stdout);
    // Stop on EOF / bad input instead of reusing a stale or uninitialised n.
    if (scanf("%d",&n) != 1 || n <= 0)
      break;
    printf("Enter the number of blocks: (<=%d) ", MAX_BLOCKS);fflush(stdout);
    // blocks must be positive: it is the grid size and it scales the size of
    // the copy back into mypi.
    if (scanf("%d",&blocks) != 1 || blocks < 1 || blocks > MAX_BLOCKS)
      break;

    gettimeofday(&startwtime, NULL);

    // copy from CPU to GPU
    if (!cuda_ok(cudaMemcpy( n_d, &n, sizeof(int) * 1, cudaMemcpyHostToDevice ),
                 "cudaMemcpy(n)") ||
        !cuda_ok(cudaMemcpy( blocks_d, &blocks, sizeof(int) * 1, cudaMemcpyHostToDevice ),
                 "cudaMemcpy(blocks)")) {
      status = 1;
      break;
    }

    integrate<<< blocks, THREADS >>>(n_d, blocks_d, mypi_d);
    // A launch does not return an error itself; query it explicitly.
    if (!cuda_ok(cudaGetLastError(), "integrate launch")) {
      status = 1;
      break;
    }

    // copy back from GPU to CPU; errors from the kernel's execution are
    // reported by this call as well
    if (!cuda_ok(cudaMemcpy( mypi, mypi_d, sizeof(double) * THREADS * blocks,
                             cudaMemcpyDeviceToHost ),
                 "cudaMemcpy(mypi)")) {
      status = 1;
      break;
    }

    // Final reduction of the per-thread partial sums on the host.
    pi = 0.0;
    for (i = 0; i < THREADS * blocks; i++)
      pi += mypi[i];

    gettimeofday(&endwtime, NULL);
    MINUS_UTIME(diffwtime, endwtime, startwtime);
    printf("pi is approximately %.16f, Error is %.16f\n",
           pi, fabs(pi - PI25DT));
    // tv_sec / tv_usec are not int on LP64 platforms; cast to match %ld.
    printf("wall clock time = %ld.%06ld\n",
           (long) diffwtime.tv_sec, (long) diffwtime.tv_usec);
  }

  // free GPU memory (pointers that were never allocated are still NULL)
  cudaFree(n_d);
  cudaFree(blocks_d);
  cudaFree(mypi_d);

  return status;
}