// GPU kernel
__global__ void integrate(int *n, double *sum)
{
    double h, x;
    int i;

    *sum = 0.0;
    h = 1.0 / (double) *n;
    for (i = 1; i <= *n; i++) {
        x = h * ((double) i - 0.5);
        *sum += 4.0 / (1.0 + x*x);
    }
    *sum *= h;
}

// notice the underscore "_" after the function name -- needed by gfortran
extern "C" void fortran_call_integrate_(int *n, double *pi)
{
    int    *n_d;   // device copy of n
    double *pi_d;  // device copy of pi

    // Allocate memory on GPU
    cudaMalloc( (void **) &n_d,  sizeof(int)    * 1 );
    cudaMalloc( (void **) &pi_d, sizeof(double) * 1 );

    // copy from CPU to GPU
    cudaMemcpy( n_d, n, sizeof(int) * 1, cudaMemcpyHostToDevice );

    // launch the kernel on a single block with a single thread
    integrate<<< 1, 1 >>>(n_d, pi_d);

    // copy back from GPU to CPU
    cudaMemcpy( pi, pi_d, sizeof(double) * 1, cudaMemcpyDeviceToHost );

    // free GPU memory
    cudaFree(n_d);
    cudaFree(pi_d);
}
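The Fortran side that drives this wrapper is not part of the listing above; a minimal sketch (program name, variable names, and the value of n are assumptions) could look like the following. Because gfortran lowercases external names and appends a trailing underscore, calling fortran_call_integrate from Fortran resolves to the fortran_call_integrate_ symbol defined above, and the integer and double precision arguments are passed by reference, matching the int* and double* parameters.

program pi_test
  implicit none
  integer          :: n
  double precision :: pi

  n = 1000000
  ! gfortran appends a trailing underscore, so this call binds to fortran_call_integrate_
  call fortran_call_integrate(n, pi)
  print *, 'pi = ', pi
end program pi_test

One possible build, assuming the files are named integrate.cu and pi_test.f90: compile the CUDA file with nvcc -c integrate.cu, then link with gfortran pi_test.f90 integrate.o -lcudart (adding -L for the CUDA runtime library directory if it is not on the default search path).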