!-------------------------------------------------------------------------! ! ! ! N A S P A R A L L E L B E N C H M A R K S 3.3 ! ! ! ! M G ! ! ! !-------------------------------------------------------------------------! ! ! ! This benchmark is part of the NAS Parallel Benchmark 3.3 suite. ! ! It is described in NAS Technical Reports 95-020 and 02-007 ! ! ! ! Permission to use, copy, distribute and modify this software ! ! for any purpose with or without fee is hereby granted. We ! ! request, however, that all derived work reference the NAS ! ! Parallel Benchmarks 3.3. This software is provided "as is" ! ! without express or implied warranty. ! ! ! ! Information on NPB 3.3, including the technical report, the ! ! original specifications, source code, results and information ! ! on how to submit new results, is available at: ! ! ! ! http://www.nas.nasa.gov/Software/NPB/ ! ! ! ! Send comments or suggestions to npb@nas.nasa.gov ! ! ! ! NAS Parallel Benchmarks Group ! ! NASA Ames Research Center ! ! Mail Stop: T27A-1 ! ! Moffett Field, CA 94035-1000 ! ! ! ! E-mail: npb@nas.nasa.gov ! ! Fax: (650) 604-3957 ! ! ! !-------------------------------------------------------------------------! c--------------------------------------------------------------------- c c Authors: E. Barszcz c P. Frederickson c A. Woo c M. Yarrow c R. F. Van der Wijngaart c c--------------------------------------------------------------------- c--------------------------------------------------------------------- program mg_mpi c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' c---------------------------------------------------------------------------c c k is the current level. It is passed down through subroutine args c and is NOT global. it is the current iteration c---------------------------------------------------------------------------c integer k, it external timer_read double precision t, t0, tinit, mflops, timer_read c---------------------------------------------------------------------------c c These arrays are in common because they are quite large c and probably shouldn't be allocated on the stack. They c are always passed as subroutine args. c---------------------------------------------------------------------------c double precision u(nr),v(nv),r(nr),a(0:3),c(0:3) common /noautom/ u,v,r double precision rnm2, rnmu, old2, oldu, epsilon integer n1, n2, n3, nit double precision nn, verify_value, err logical verified integer ierr,i, fstatus integer T_bench, T_init parameter (T_bench=1, T_init=2) call mpi_init(ierr) call mpi_comm_rank(mpi_comm_world, me, ierr) call mpi_comm_size(mpi_comm_world, nprocs, ierr) root = 0 if (nprocs_compiled .gt. maxprocs) then if (me .eq. root) write(*,20) nprocs_compiled, maxprocs 20 format(' ERROR: compiled for ',i8,' processes'// & ' The maximum size allowed for this benchmark is ',i6) call mpi_abort(MPI_COMM_WORLD, ierr) stop endif if (.not. convertdouble) then dp_type = MPI_DOUBLE_PRECISION else dp_type = MPI_REAL endif call timer_clear(T_bench) call timer_clear(T_init) call mpi_barrier(MPI_COMM_WORLD, ierr) call timer_start(T_init) c--------------------------------------------------------------------- c Read in and broadcast input data c--------------------------------------------------------------------- if( me .eq. root )then write (*, 1000) open(unit=7,file="mg.input", status="old", iostat=fstatus) if (fstatus .eq. 0) then write(*,50) 50 format(' Reading from input file mg.input') read(7,*) lt read(7,*) nx(lt), ny(lt), nz(lt) read(7,*) nit read(7,*) (debug_vec(i),i=0,7) else write(*,51) 51 format(' No input file. Using compiled defaults ') lt = lt_default nit = nit_default nx(lt) = nx_default ny(lt) = ny_default nz(lt) = nz_default do i = 0,7 debug_vec(i) = debug_default end do endif endif call mpi_bcast(lt, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) call mpi_bcast(nit, 1, MPI_INTEGER, 0, mpi_comm_world, ierr) call mpi_bcast(nx(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) call mpi_bcast(ny(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) call mpi_bcast(nz(lt), 1, MPI_INTEGER, 0, mpi_comm_world, ierr) call mpi_bcast(debug_vec(0), 8, MPI_INTEGER, 0, > mpi_comm_world, ierr) if ( (nx(lt) .ne. ny(lt)) .or. (nx(lt) .ne. nz(lt)) ) then Class = 'U' else if( nx(lt) .eq. 32 .and. nit .eq. 4 ) then Class = 'S' else if( nx(lt) .eq. 128 .and. nit .eq. 4 ) then Class = 'W' else if( nx(lt) .eq. 256 .and. nit .eq. 4 ) then Class = 'A' else if( nx(lt) .eq. 256 .and. nit .eq. 20 ) then Class = 'B' else if( nx(lt) .eq. 512 .and. nit .eq. 20 ) then Class = 'C' else if( nx(lt) .eq. 1024 .and. nit .eq. 50 ) then Class = 'D' else if( nx(lt) .eq. 2048 .and. nit .eq. 50 ) then Class = 'E' else Class = 'U' endif c--------------------------------------------------------------------- c Use these for debug info: c--------------------------------------------------------------------- c debug_vec(0) = 1 !=> report all norms c debug_vec(1) = 1 !=> some setup information c debug_vec(1) = 2 !=> more setup information c debug_vec(2) = k => at level k or below, show result of resid c debug_vec(3) = k => at level k or below, show result of psinv c debug_vec(4) = k => at level k or below, show result of rprj c debug_vec(5) = k => at level k or below, show result of interp c debug_vec(6) = 1 => (unused) c debug_vec(7) = 1 => (unused) c--------------------------------------------------------------------- a(0) = -8.0D0/3.0D0 a(1) = 0.0D0 a(2) = 1.0D0/6.0D0 a(3) = 1.0D0/12.0D0 if(Class .eq. 'A' .or. Class .eq. 'S'.or. Class .eq.'W') then c--------------------------------------------------------------------- c Coefficients for the S(a) smoother c--------------------------------------------------------------------- c(0) = -3.0D0/8.0D0 c(1) = +1.0D0/32.0D0 c(2) = -1.0D0/64.0D0 c(3) = 0.0D0 else c--------------------------------------------------------------------- c Coefficients for the S(b) smoother c--------------------------------------------------------------------- c(0) = -3.0D0/17.0D0 c(1) = +1.0D0/33.0D0 c(2) = -1.0D0/61.0D0 c(3) = 0.0D0 endif lb = 1 k = lt call setup(n1,n2,n3,k) call zero3(u,n1,n2,n3) call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) call norm2u3(v,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) if( me .eq. root )then write (*, 1001) nx(lt),ny(lt),nz(lt), Class write (*, 1002) nit 1000 format(//,' NAS Parallel Benchmarks 3.3 -- MG Benchmark', /) 1001 format(' Size: ', i4, 'x', i4, 'x', i4, ' (class ', A, ')' ) 1002 format(' Iterations: ', i4) 1003 format(' Number of processes: ', i6) if (nprocs .ne. nprocs_compiled) then write (*, 1004) nprocs_compiled write (*, 1005) nprocs 1004 format(' WARNING: compiled for ', i6, ' processes ') 1005 format(' Number of active processes: ', i6, /) else write (*, 1003) nprocs endif endif call resid(u,v,r,n1,n2,n3,a,k) call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) old2 = rnm2 oldu = rnmu c--------------------------------------------------------------------- c One iteration for startup c--------------------------------------------------------------------- call mg3P(u,v,r,a,c,n1,n2,n3,k) call resid(u,v,r,n1,n2,n3,a,k) call setup(n1,n2,n3,k) call zero3(u,n1,n2,n3) call zran3(v,n1,n2,n3,nx(lt),ny(lt),k) call timer_stop(T_init) if( me .eq. root )then tinit = timer_read(T_init) write( *,'(/A,F15.3,A/)' ) > ' Initialization time: ',tinit, ' seconds' endif call mpi_barrier(mpi_comm_world,ierr) call timer_start(T_bench) call resid(u,v,r,n1,n2,n3,a,k) call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) old2 = rnm2 oldu = rnmu do it=1,nit if (it.eq.1 .or. it.eq.nit .or. mod(it,5).eq.0) then if (me .eq. root) write(*,80) it 80 format(' iter ',i4) endif call mg3P(u,v,r,a,c,n1,n2,n3,k) call resid(u,v,r,n1,n2,n3,a,k) enddo call norm2u3(r,n1,n2,n3,rnm2,rnmu,nx(lt),ny(lt),nz(lt)) call timer_stop(T_bench) t0 = timer_read(T_bench) call mpi_reduce(t0,t,1,dp_type, > mpi_max,root,mpi_comm_world,ierr) verified = .FALSE. verify_value = 0.0 if( me .eq. root )then write(*,100) 100 format(/' Benchmark completed ') epsilon = 1.d-8 if (Class .ne. 'U') then if(Class.eq.'S') then verify_value = 0.5307707005734d-04 elseif(Class.eq.'W') then verify_value = 0.6467329375339d-05 elseif(Class.eq.'A') then verify_value = 0.2433365309069d-05 elseif(Class.eq.'B') then verify_value = 0.1800564401355d-05 elseif(Class.eq.'C') then verify_value = 0.5706732285740d-06 elseif(Class.eq.'D') then verify_value = 0.1583275060440d-09 elseif(Class.eq.'E') then verify_value = 0.5630442584711d-10 endif err = abs( rnm2 - verify_value ) / verify_value if( err .le. epsilon ) then verified = .TRUE. write(*, 200) write(*, 201) rnm2 write(*, 202) err 200 format(' VERIFICATION SUCCESSFUL ') 201 format(' L2 Norm is ', E20.13) 202 format(' Error is ', E20.13) else verified = .FALSE. write(*, 300) write(*, 301) rnm2 write(*, 302) verify_value 300 format(' VERIFICATION FAILED') 301 format(' L2 Norm is ', E20.13) 302 format(' The correct L2 Norm is ', E20.13) endif else verified = .FALSE. write (*, 400) write (*, 401) write (*, 201) rnm2 400 format(' Problem size unknown') 401 format(' NO VERIFICATION PERFORMED') endif nn = 1.0d0*nx(lt)*ny(lt)*nz(lt) if( t .ne. 0. ) then mflops = 58.*1.0D-6*nit*nn / t else mflops = 0.0 endif call print_results('MG', class, nx(lt), ny(lt), nz(lt), > nit, nprocs_compiled, nprocs, t, > mflops, ' floating point', > verified, npbversion, compiletime, > cs1, cs2, cs3, cs4, cs5, cs6, cs7) endif call mpi_finalize(ierr) end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine setup(n1,n2,n3,k) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer is1, is2, is3, ie1, ie2, ie3 common /grid/ is1,is2,is3,ie1,ie2,ie3 integer n1,n2,n3,k integer dx, dy, log_p, d, i, j integer ax, next(3),mi(3,maxlevel),mip(3,maxlevel) integer ng(3,maxlevel) integer idi(3), pi(3), idin(3,-1:1) integer s, dir,ierr do j=-1,1,1 do d=1,3 msg_type(d,j) = 100*(j+2+10*d) enddo enddo ng(1,lt) = nx(lt) ng(2,lt) = ny(lt) ng(3,lt) = nz(lt) do ax=1,3 next(ax) = 1 do k=lt-1,1,-1 ng(ax,k) = ng(ax,k+1)/2 enddo enddo 61 format(10i4) do k=lt,1,-1 nx(k) = ng(1,k) ny(k) = ng(2,k) nz(k) = ng(3,k) enddo log_p = log(float(nprocs)+0.0001)/log(2.0) dx = log_p/3 pi(1) = 2**dx idi(1) = mod(me,pi(1)) dy = (log_p-dx)/2 pi(2) = 2**dy idi(2) = mod((me/pi(1)),pi(2)) pi(3) = nprocs/(pi(1)*pi(2)) idi(3) = me/(pi(1)*pi(2)) do k = lt,1,-1 dead(k) = .false. do ax = 1,3 take_ex(ax,k) = .false. give_ex(ax,k) = .false. mi(ax,k) = 2 + > ((idi(ax)+1)*ng(ax,k))/pi(ax) - > ((idi(ax)+0)*ng(ax,k))/pi(ax) mip(ax,k) = 2 + > ((next(ax)+idi(ax)+1)*ng(ax,k))/pi(ax) - > ((next(ax)+idi(ax)+0)*ng(ax,k))/pi(ax) if(mip(ax,k).eq.2.or.mi(ax,k).eq.2)then next(ax) = 2*next(ax) endif if( k+1 .le. lt )then if((mip(ax,k).eq.2).and.(mi(ax,k).eq.3))then give_ex(ax,k+1) = .true. endif if((mip(ax,k).eq.3).and.(mi(ax,k).eq.2))then take_ex(ax,k+1) = .true. endif endif enddo if( mi(1,k).eq.2 .or. > mi(2,k).eq.2 .or. > mi(3,k).eq.2 )then dead(k) = .true. endif m1(k) = mi(1,k) m2(k) = mi(2,k) m3(k) = mi(3,k) do ax=1,3 idin(ax,+1) = mod( idi(ax) + next(ax) + pi(ax) , pi(ax) ) idin(ax,-1) = mod( idi(ax) - next(ax) + pi(ax) , pi(ax) ) enddo do dir = 1,-1,-2 nbr(1,dir,k) = idin(1,dir) + pi(1) > *(idi(2) + pi(2) > * idi(3)) nbr(2,dir,k) = idi(1) + pi(1) > *(idin(2,dir) + pi(2) > * idi(3)) nbr(3,dir,k) = idi(1) + pi(1) > *(idi(2) + pi(2) > * idin(3,dir)) enddo enddo k = lt is1 = 2 + ng(1,k) - ((pi(1) -idi(1))*ng(1,lt))/pi(1) ie1 = 1 + ng(1,k) - ((pi(1)-1-idi(1))*ng(1,lt))/pi(1) n1 = 3 + ie1 - is1 is2 = 2 + ng(2,k) - ((pi(2) -idi(2))*ng(2,lt))/pi(2) ie2 = 1 + ng(2,k) - ((pi(2)-1-idi(2))*ng(2,lt))/pi(2) n2 = 3 + ie2 - is2 is3 = 2 + ng(3,k) - ((pi(3) -idi(3))*ng(3,lt))/pi(3) ie3 = 1 + ng(3,k) - ((pi(3)-1-idi(3))*ng(3,lt))/pi(3) n3 = 3 + ie3 - is3 ir(lt)=1 do j = lt-1, 1, -1 ir(j)=ir(j+1)+m1(j+1)*m2(j+1)*m3(j+1) enddo if( debug_vec(1) .ge. 1 )then if( me .eq. root )write(*,*)' in setup, ' if( me .eq. root )write(*,*)' me k lt nx ny nz ', > ' n1 n2 n3 is1 is2 is3 ie1 ie2 ie3' do i=0,nprocs-1 if( me .eq. i )then write(*,9) me,k,lt,ng(1,k),ng(2,k),ng(3,k), > n1,n2,n3,is1,is2,is3,ie1,ie2,ie3 9 format(15i4) endif call mpi_barrier(mpi_comm_world,ierr) enddo endif if( debug_vec(1) .ge. 2 )then do i=0,nprocs-1 if( me .eq. i )then write(*,*)' ' write(*,*)' processor =',me do k=lt,1,-1 write(*,7)k,idi(1),idi(2),idi(3), > ((nbr(d,j,k),j=-1,1,2),d=1,3), > (mi(d,k),d=1,3) enddo 7 format(i4,'idi=',3i4,'nbr=',3(2i4,' '),'mi=',3i4,' ') write(*,*)'idi(s) = ',(idi(s),s=1,3) write(*,*)'dead(2), dead(1) = ',dead(2),dead(1) do ax=1,3 write(*,*)'give_ex(ax,2)= ',give_ex(ax,2) write(*,*)'take_ex(ax,2)= ',take_ex(ax,2) enddo endif call mpi_barrier(mpi_comm_world,ierr) enddo endif k = lt return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine mg3P(u,v,r,a,c,n1,n2,n3,k) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c multigrid V-cycle routine c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer n1, n2, n3, k double precision u(nr),v(nv),r(nr) double precision a(0:3),c(0:3) integer j c--------------------------------------------------------------------- c down cycle. c restrict the residual from the find grid to the coarse c--------------------------------------------------------------------- do k= lt, lb+1 , -1 j = k-1 call rprj3(r(ir(k)),m1(k),m2(k),m3(k), > r(ir(j)),m1(j),m2(j),m3(j),k) enddo k = lb c--------------------------------------------------------------------- c compute an approximate solution on the coarsest grid c--------------------------------------------------------------------- call zero3(u(ir(k)),m1(k),m2(k),m3(k)) call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) do k = lb+1, lt-1 j = k-1 c--------------------------------------------------------------------- c prolongate from level k-1 to k c--------------------------------------------------------------------- call zero3(u(ir(k)),m1(k),m2(k),m3(k)) call interp(u(ir(j)),m1(j),m2(j),m3(j), > u(ir(k)),m1(k),m2(k),m3(k),k) c--------------------------------------------------------------------- c compute residual for level k c--------------------------------------------------------------------- call resid(u(ir(k)),r(ir(k)),r(ir(k)),m1(k),m2(k),m3(k),a,k) c--------------------------------------------------------------------- c apply smoother c--------------------------------------------------------------------- call psinv(r(ir(k)),u(ir(k)),m1(k),m2(k),m3(k),c,k) enddo 200 continue j = lt - 1 k = lt call interp(u(ir(j)),m1(j),m2(j),m3(j),u,n1,n2,n3,k) call resid(u,v,r,n1,n2,n3,a,k) call psinv(r,u,n1,n2,n3,c,k) return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine psinv( r,u,n1,n2,n3,c,k) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c psinv applies an approximate inverse as smoother: u = u + Cr c c This implementation costs 15A + 4M per result, where c A and M denote the costs of Addition and Multiplication. c Presuming coefficient c(3) is zero (the NPB assumes this, c but it is thus not a general case), 2A + 1M may be eliminated, c resulting in 13A + 3M. c Note that this vectorizes, and is also fine for cache c based machines. c--------------------------------------------------------------------- implicit none include 'globals.h' integer n1,n2,n3,k double precision u(n1,n2,n3),r(n1,n2,n3),c(0:3) integer i3, i2, i1 double precision r1(m), r2(m), r3, r4, r5, r6 do i3=2,n3-1 do i2=2,n2-1 do i1=2,n1-1 r3 = r(i1,i2-1,i3) + r(i1,i2+1,i3) > + r(i1,i2,i3-1) + r(i1,i2,i3+1) r4 = r(i1-1,i2-1,i3) + r(i1-1,i2+1,i3) > + r(i1-1,i2,i3-1) + r(i1-1,i2,i3+1) r5 = r(i1+1,i2-1,i3) + r(i1+1,i2+1,i3) > + r(i1+1,i2,i3-1) + r(i1+1,i2,i3+1) r6 = r(i1,i2-1,i3-1) + r(i1,i2+1,i3-1) > + r(i1,i2-1,i3+1) + r(i1,i2+1,i3+1) u(i1,i2,i3) = u(i1,i2,i3) > + c(0) * r(i1,i2,i3) > + c(1) * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) > + r3 ) > + c(2) * ( r6 + r4 + r5 ) enddo enddo enddo c--------------------------------------------------------------------- c exchange boundary points c--------------------------------------------------------------------- call comm3(u,n1,n2,n3,k) if( debug_vec(0) .ge. 1 )then call rep_nrm(u,n1,n2,n3,' psinv',k) endif if( debug_vec(3) .ge. k )then call showall(u,n1,n2,n3) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine resid( u,v,r,n1,n2,n3,a,k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c resid computes the residual: r = v - Au c c This implementation costs 15A + 4M per result, where c A and M denote the costs of Addition (or Subtraction) and c Multiplication, respectively. c Presuming coefficient a(1) is zero (the NPB assumes this, c but it is thus not a general case), 3A + 1M may be eliminated, c resulting in 12A + 3M. c Note that this vectorizes, and is also fine for cache c based machines. c--------------------------------------------------------------------- implicit none include 'globals.h' integer n1,n2,n3,k double precision u(n1,n2,n3),v(n1,n2,n3),r(n1,n2,n3),a(0:3) integer i3, i2, i1, i2block, i3block, BLOCK2, BLOCK3 parameter (BLOCK2=22, BLOCK3=23) double precision u3, u4, u5, u6, u7, u8,u9,u10 u8=a(0) u9=a(2) u10=a(3) !$acc region copyin(u,v,u8,u9,u10) copyout(r) !$acc do private(u3,u4,u5,u6,u7) do i3=2, n3-1 do i2=2, n2-1 do i1=2,n1-1 u3 = u(i1-1,i2-1,i3) + u(i1-1,i2+1,i3) > + u(i1-1,i2,i3-1) + u(i1-1,i2,i3+1) u4 = u(i1+1,i2-1,i3) + u(i1+1,i2+1,i3) > + u(i1+1,i2,i3-1) + u(i1+1,i2,i3+1) u5 = u(i1,i2-1,i3-1) + u(i1,i2+1,i3-1) > + u(i1,i2-1,i3+1) + u(i1,i2+1,i3+1) u6 = u(i1-1,i2-1,i3-1) + u(i1-1,i2+1,i3-1) > + u(i1-1,i2-1,i3+1) + u(i1-1,i2+1,i3+1) u7 = u(i1+1,i2-1,i3-1) + u(i1+1,i2+1,i3-1) > + u(i1+1,i2-1,i3+1) + u(i1+1,i2+1,i3+1) r(i1,i2,i3) = v(i1,i2,i3) > - u8 * u(i1,i2,i3) > - u9 * ( u5 + u3 + u4 ) > - u10 * ( u6 + u7 ) enddo enddo enddo !$acc end region c--------------------------------------------------------------------- c exchange boundary data c--------------------------------------------------------------------- call comm3(r,n1,n2,n3,k) if( debug_vec(0) .ge. 1 )then call rep_nrm(r,n1,n2,n3,' resid',k) endif if( debug_vec(2) .ge. k )then call showall(r,n1,n2,n3) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine rprj3( r,m1k,m2k,m3k,s,m1j,m2j,m3j,k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c rprj3 projects onto the next coarser grid, c using a trilinear Finite Element projection: s = r' = P r c c This implementation costs 20A + 4M per result, where c A and M denote the costs of Addition and Multiplication. c Note that this vectorizes, and is also fine for cache c based machines. c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer m1k, m2k, m3k, m1j, m2j, m3j,k double precision r(m1k,m2k,m3k), s(m1j,m2j,m3j) integer j3, j2, j1, i3, i2, i1, d1, d2, d3, j double precision x1(m), y1(m), x2,y2 if(m1k.eq.3)then d1 = 2 else d1 = 1 endif if(m2k.eq.3)then d2 = 2 else d2 = 1 endif if(m3k.eq.3)then d3 = 2 else d3 = 1 endif do j3=2,m3j-1 i3 = 2*j3-d3 C i3 = 2*j3-1 do j2=2,m2j-1 i2 = 2*j2-d2 C i2 = 2*j2-1 do j1=2,m1j i1 = 2*j1-d1 C i1 = 2*j1-1 x1(i1-1) = r(i1-1,i2-1,i3 ) + r(i1-1,i2+1,i3 ) > + r(i1-1,i2, i3-1) + r(i1-1,i2, i3+1) y1(i1-1) = r(i1-1,i2-1,i3-1) + r(i1-1,i2-1,i3+1) > + r(i1-1,i2+1,i3-1) + r(i1-1,i2+1,i3+1) enddo do j1=2,m1j-1 i1 = 2*j1-d1 C i1 = 2*j1-1 y2 = r(i1, i2-1,i3-1) + r(i1, i2-1,i3+1) > + r(i1, i2+1,i3-1) + r(i1, i2+1,i3+1) x2 = r(i1, i2-1,i3 ) + r(i1, i2+1,i3 ) > + r(i1, i2, i3-1) + r(i1, i2, i3+1) s(j1,j2,j3) = > 0.5D0 * r(i1,i2,i3) > + 0.25D0 * ( r(i1-1,i2,i3) + r(i1+1,i2,i3) + x2) > + 0.125D0 * ( x1(i1-1) + x1(i1+1) + y2) > + 0.0625D0 * ( y1(i1-1) + y1(i1+1) ) enddo enddo enddo j = k-1 call comm3(s,m1j,m2j,m3j,j) if( debug_vec(0) .ge. 1 )then call rep_nrm(s,m1j,m2j,m3j,' rprj3',k-1) endif if( debug_vec(4) .ge. k )then call showall(s,m1j,m2j,m3j) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine interp( z,mm1,mm2,mm3,u,n1,n2,n3,k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c interp adds the trilinear interpolation of the correction c from the coarser grid to the current approximation: u = u + Qu' c c Observe that this implementation costs 16A + 4M, where c A and M denote the costs of Addition and Multiplication. c Note that this vectorizes, and is also fine for cache c based machines. Vector machines may get slightly better c performance however, with 8 separate "do i1" loops, rather than 4. c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer mm1, mm2, mm3, n1, n2, n3,k double precision z(mm1,mm2,mm3),u(n1,n2,n3) integer i3, i2, i1, d1, d2, d3, t1, t2, t3 c note that m = 1037 in globals.h but for this only need to be c 535 to handle up to 1024^3 c integer m c parameter( m=535 ) double precision z1(m),z2(m),z3(m) if( n1 .ne. 3 .and. n2 .ne. 3 .and. n3 .ne. 3 ) then do i3=1,mm3-1 do i2=1,mm2-1 do i1=1,mm1 z1(i1) = z(i1,i2+1,i3) + z(i1,i2,i3) z2(i1) = z(i1,i2,i3+1) + z(i1,i2,i3) z3(i1) = z(i1,i2+1,i3+1) + z(i1,i2,i3+1) + z1(i1) enddo do i1=1,mm1-1 u(2*i1-1,2*i2-1,2*i3-1)=u(2*i1-1,2*i2-1,2*i3-1) > +z(i1,i2,i3) u(2*i1,2*i2-1,2*i3-1)=u(2*i1,2*i2-1,2*i3-1) > +0.5d0*(z(i1+1,i2,i3)+z(i1,i2,i3)) enddo do i1=1,mm1-1 u(2*i1-1,2*i2,2*i3-1)=u(2*i1-1,2*i2,2*i3-1) > +0.5d0 * z1(i1) u(2*i1,2*i2,2*i3-1)=u(2*i1,2*i2,2*i3-1) > +0.25d0*( z1(i1) + z1(i1+1) ) enddo do i1=1,mm1-1 u(2*i1-1,2*i2-1,2*i3)=u(2*i1-1,2*i2-1,2*i3) > +0.5d0 * z2(i1) u(2*i1,2*i2-1,2*i3)=u(2*i1,2*i2-1,2*i3) > +0.25d0*( z2(i1) + z2(i1+1) ) enddo do i1=1,mm1-1 u(2*i1-1,2*i2,2*i3)=u(2*i1-1,2*i2,2*i3) > +0.25d0* z3(i1) u(2*i1,2*i2,2*i3)=u(2*i1,2*i2,2*i3) > +0.125d0*( z3(i1) + z3(i1+1) ) enddo enddo enddo else if(n1.eq.3)then d1 = 2 t1 = 1 else d1 = 1 t1 = 0 endif if(n2.eq.3)then d2 = 2 t2 = 1 else d2 = 1 t2 = 0 endif if(n3.eq.3)then d3 = 2 t3 = 1 else d3 = 1 t3 = 0 endif do i3=d3,mm3-1 do i2=d2,mm2-1 do i1=d1,mm1-1 u(2*i1-d1,2*i2-d2,2*i3-d3)=u(2*i1-d1,2*i2-d2,2*i3-d3) > +z(i1,i2,i3) enddo do i1=1,mm1-1 u(2*i1-t1,2*i2-d2,2*i3-d3)=u(2*i1-t1,2*i2-d2,2*i3-d3) > +0.5D0*(z(i1+1,i2,i3)+z(i1,i2,i3)) enddo enddo do i2=1,mm2-1 do i1=d1,mm1-1 u(2*i1-d1,2*i2-t2,2*i3-d3)=u(2*i1-d1,2*i2-t2,2*i3-d3) > +0.5D0*(z(i1,i2+1,i3)+z(i1,i2,i3)) enddo do i1=1,mm1-1 u(2*i1-t1,2*i2-t2,2*i3-d3)=u(2*i1-t1,2*i2-t2,2*i3-d3) > +0.25D0*(z(i1+1,i2+1,i3)+z(i1+1,i2,i3) > +z(i1, i2+1,i3)+z(i1, i2,i3)) enddo enddo enddo do i3=1,mm3-1 do i2=d2,mm2-1 do i1=d1,mm1-1 u(2*i1-d1,2*i2-d2,2*i3-t3)=u(2*i1-d1,2*i2-d2,2*i3-t3) > +0.5D0*(z(i1,i2,i3+1)+z(i1,i2,i3)) enddo do i1=1,mm1-1 u(2*i1-t1,2*i2-d2,2*i3-t3)=u(2*i1-t1,2*i2-d2,2*i3-t3) > +0.25D0*(z(i1+1,i2,i3+1)+z(i1,i2,i3+1) > +z(i1+1,i2,i3 )+z(i1,i2,i3 )) enddo enddo do i2=1,mm2-1 do i1=d1,mm1-1 u(2*i1-d1,2*i2-t2,2*i3-t3)=u(2*i1-d1,2*i2-t2,2*i3-t3) > +0.25D0*(z(i1,i2+1,i3+1)+z(i1,i2,i3+1) > +z(i1,i2+1,i3 )+z(i1,i2,i3 )) enddo do i1=1,mm1-1 u(2*i1-t1,2*i2-t2,2*i3-t3)=u(2*i1-t1,2*i2-t2,2*i3-t3) > +0.125D0*(z(i1+1,i2+1,i3+1)+z(i1+1,i2,i3+1) > +z(i1 ,i2+1,i3+1)+z(i1 ,i2,i3+1) > +z(i1+1,i2+1,i3 )+z(i1+1,i2,i3 ) > +z(i1 ,i2+1,i3 )+z(i1 ,i2,i3 )) enddo enddo enddo endif call comm3_ex(u,n1,n2,n3,k) if( debug_vec(0) .ge. 1 )then call rep_nrm(z,mm1,mm2,mm3,'z: inter',k-1) call rep_nrm(u,n1,n2,n3,'u: inter',k) endif if( debug_vec(5) .ge. k )then call showall(z,mm1,mm2,mm3) call showall(u,n1,n2,n3) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine norm2u3(r,n1,n2,n3,rnm2,rnmu,nx,ny,nz) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c norm2u3 evaluates approximations to the L2 norm and the c uniform (or L-infinity or Chebyshev) norm, under the c assumption that the boundaries are periodic or zero. Add the c boundaries in with half weight (quarter weight on the edges c and eighth weight at the corners) for inhomogeneous boundaries. c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer n1, n2, n3, nx, ny, nz double precision rnm2, rnmu, r(n1,n2,n3) double precision s, a, ss integer i3, i2, i1, ierr double precision dn dn = 1.0d0*nx*ny*nz s=0.0D0 rnmu = 0.0D0 do i3=2,n3-1 do i2=2,n2-1 do i1=2,n1-1 s=s+r(i1,i2,i3)**2 a=abs(r(i1,i2,i3)) if(a.gt.rnmu)rnmu=a enddo enddo enddo call mpi_allreduce(rnmu,ss,1,dp_type, > mpi_max,mpi_comm_world,ierr) rnmu = ss call mpi_allreduce(s, ss, 1, dp_type, > mpi_sum,mpi_comm_world,ierr) s = ss rnm2=sqrt( s / dn ) return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine rep_nrm(u,n1,n2,n3,title,kk) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c report on norm c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer n1, n2, n3, kk double precision u(n1,n2,n3) character*8 title double precision rnm2, rnmu call norm2u3(u,n1,n2,n3,rnm2,rnmu,nx(kk),ny(kk),nz(kk)) if( me .eq. root )then write(*,7)kk,title,rnm2,rnmu 7 format(' Level',i2,' in ',a8,': norms =',D21.14,D21.14) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine comm3(u,n1,n2,n3,kk) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c comm3 organizes the communication on all borders c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer n1, n2, n3, kk double precision u(n1,n2,n3) integer axis if( .not. dead(kk) )then do axis = 1, 3 if( nprocs .ne. 1) then call ready( axis, -1, kk ) call ready( axis, +1, kk ) call give3( axis, +1, u, n1, n2, n3, kk ) call give3( axis, -1, u, n1, n2, n3, kk ) call take3( axis, -1, u, n1, n2, n3 ) call take3( axis, +1, u, n1, n2, n3 ) else call comm1p( axis, u, n1, n2, n3, kk ) endif enddo else call zero3(u,n1,n2,n3) endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine comm3_ex(u,n1,n2,n3,kk) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c comm3_ex communicates to expand the number of processors c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer n1, n2, n3, kk double precision u(n1,n2,n3) integer axis do axis = 1, 3 if( nprocs .ne. 1 ) then if( take_ex( axis, kk ) )then call ready( axis, -1, kk ) call ready( axis, +1, kk ) call take3_ex( axis, -1, u, n1, n2, n3 ) call take3_ex( axis, +1, u, n1, n2, n3 ) endif if( give_ex( axis, kk ) )then call give3_ex( axis, +1, u, n1, n2, n3, kk ) call give3_ex( axis, -1, u, n1, n2, n3, kk ) endif else call comm1p_ex( axis, u, n1, n2, n3, kk ) endif enddo return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine ready( axis, dir, k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c ready allocates a buffer to take in a message c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, k integer buff_id,buff_len,i,ierr buff_id = 3 + dir buff_len = nm2 do i=1,nm2 buff(i,buff_id) = 0.0D0 enddo c--------------------------------------------------------------------- c fake message request type c--------------------------------------------------------------------- msg_id(axis,dir,1) = msg_type(axis,dir) +1000*me call mpi_irecv( buff(1,buff_id), buff_len, > dp_type, nbr(axis,-dir,k), msg_type(axis,dir), > mpi_comm_world, msg_id(axis,dir,1), ierr) return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine give3( axis, dir, u, n1, n2, n3, k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c give3 sends border data out in the requested direction c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3, k, ierr double precision u( n1, n2, n3 ) integer i3, i2, i1, buff_len,buff_id buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then if( dir .eq. -1 )then do i3=2,n3-1 do i2=2,n2-1 buff_len = buff_len + 1 buff(buff_len,buff_id ) = u( 2, i2,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i3=2,n3-1 do i2=2,n2-1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( n1-1, i2,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif if( axis .eq. 2 )then if( dir .eq. -1 )then do i3=2,n3-1 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1, 2,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i3=2,n3-1 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id )= u( i1,n2-1,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif if( axis .eq. 3 )then if( dir .eq. -1 )then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,2) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,n3-1) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine take3( axis, dir, u, n1, n2, n3 ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c take3 copies in border data from the requested direction c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3 double precision u( n1, n2, n3 ) integer buff_id, indx integer status(mpi_status_size), ierr integer i3, i2, i1 call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then if( dir .eq. -1 )then do i3=2,n3-1 do i2=2,n2-1 indx = indx + 1 u(n1,i2,i3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i3=2,n3-1 do i2=2,n2-1 indx = indx + 1 u(1,i2,i3) = buff(indx, buff_id ) enddo enddo endif endif if( axis .eq. 2 )then if( dir .eq. -1 )then do i3=2,n3-1 do i1=1,n1 indx = indx + 1 u(i1,n2,i3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i3=2,n3-1 do i1=1,n1 indx = indx + 1 u(i1,1,i3) = buff(indx, buff_id ) enddo enddo endif endif if( axis .eq. 3 )then if( dir .eq. -1 )then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,n3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,1) = buff(indx, buff_id ) enddo enddo endif endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine give3_ex( axis, dir, u, n1, n2, n3, k ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c give3_ex sends border data out to expand number of processors c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3, k, ierr double precision u( n1, n2, n3 ) integer i3, i2, i1, buff_len, buff_id buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then if( dir .eq. -1 )then do i3=1,n3 do i2=1,n2 buff_len = buff_len + 1 buff(buff_len,buff_id ) = u( 2, i2,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i3=1,n3 do i2=1,n2 do i1=n1-1,n1 buff_len = buff_len + 1 buff(buff_len,buff_id)= u(i1,i2,i3) enddo enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif if( axis .eq. 2 )then if( dir .eq. -1 )then do i3=1,n3 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1, 2,i3) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i3=1,n3 do i2=n2-1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len,buff_id )= u(i1,i2,i3) enddo enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif if( axis .eq. 3 )then if( dir .eq. -1 )then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,2) enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) else if( dir .eq. +1 ) then do i3=n3-1,n3 do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,i3) enddo enddo enddo call mpi_send( > buff(1, buff_id ), buff_len,dp_type, > nbr( axis, dir, k ), msg_type(axis,dir), > mpi_comm_world, ierr) endif endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine take3_ex( axis, dir, u, n1, n2, n3 ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c take3_ex copies in border data to expand number of processors c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3 double precision u( n1, n2, n3 ) integer buff_id, indx integer status(mpi_status_size) , ierr integer i3, i2, i1 call mpi_wait( msg_id( axis, dir, 1 ),status,ierr) buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then if( dir .eq. -1 )then do i3=1,n3 do i2=1,n2 indx = indx + 1 u(n1,i2,i3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i3=1,n3 do i2=1,n2 do i1=1,2 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif endif if( axis .eq. 2 )then if( dir .eq. -1 )then do i3=1,n3 do i1=1,n1 indx = indx + 1 u(i1,n2,i3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i3=1,n3 do i2=1,2 do i1=1,n1 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif endif if( axis .eq. 3 )then if( dir .eq. -1 )then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,n3) = buff(indx, buff_id ) enddo enddo else if( dir .eq. +1 ) then do i3=1,2 do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine comm1p( axis, u, n1, n2, n3, kk ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3 double precision u( n1, n2, n3 ) integer i3, i2, i1, buff_len,buff_id integer i, kk, indx dir = -1 buff_id = 3 + dir buff_len = nm2 do i=1,nm2 buff(i,buff_id) = 0.0D0 enddo dir = +1 buff_id = 3 + dir buff_len = nm2 do i=1,nm2 buff(i,buff_id) = 0.0D0 enddo dir = +1 buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then do i3=2,n3-1 do i2=2,n2-1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( n1-1, i2,i3) enddo enddo endif if( axis .eq. 2 )then do i3=2,n3-1 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id )= u( i1,n2-1,i3) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,n3-1) enddo enddo endif dir = -1 buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then do i3=2,n3-1 do i2=2,n2-1 buff_len = buff_len + 1 buff(buff_len,buff_id ) = u( 2, i2,i3) enddo enddo endif if( axis .eq. 2 )then do i3=2,n3-1 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1, 2,i3) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,2) enddo enddo endif do i=1,nm2 buff(i,4) = buff(i,3) buff(i,2) = buff(i,1) enddo dir = -1 buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then do i3=2,n3-1 do i2=2,n2-1 indx = indx + 1 u(n1,i2,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 2 )then do i3=2,n3-1 do i1=1,n1 indx = indx + 1 u(i1,n2,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,n3) = buff(indx, buff_id ) enddo enddo endif dir = +1 buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then do i3=2,n3-1 do i2=2,n2-1 indx = indx + 1 u(1,i2,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 2 )then do i3=2,n3-1 do i1=1,n1 indx = indx + 1 u(i1,1,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,1) = buff(indx, buff_id ) enddo enddo endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine comm1p_ex( axis, u, n1, n2, n3, kk ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' include 'globals.h' integer axis, dir, n1, n2, n3 double precision u( n1, n2, n3 ) integer i3, i2, i1, buff_len,buff_id integer i, kk, indx if( take_ex( axis, kk ) ) then dir = -1 buff_id = 3 + dir buff_len = nm2 do i=1,nm2 buff(i,buff_id) = 0.0D0 enddo dir = +1 buff_id = 3 + dir buff_len = nm2 do i=1,nm2 buff(i,buff_id) = 0.0D0 enddo dir = -1 buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then do i3=1,n3 do i2=1,n2 indx = indx + 1 u(n1,i2,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 2 )then do i3=1,n3 do i1=1,n1 indx = indx + 1 u(i1,n2,i3) = buff(indx, buff_id ) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,n3) = buff(indx, buff_id ) enddo enddo endif dir = +1 buff_id = 3 + dir indx = 0 if( axis .eq. 1 )then do i3=1,n3 do i2=1,n2 do i1=1,2 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif if( axis .eq. 2 )then do i3=1,n3 do i2=1,2 do i1=1,n1 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif if( axis .eq. 3 )then do i3=1,2 do i2=1,n2 do i1=1,n1 indx = indx + 1 u(i1,i2,i3) = buff(indx,buff_id) enddo enddo enddo endif endif if( give_ex( axis, kk ) )then dir = +1 buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then do i3=1,n3 do i2=1,n2 do i1=n1-1,n1 buff_len = buff_len + 1 buff(buff_len,buff_id)= u(i1,i2,i3) enddo enddo enddo endif if( axis .eq. 2 )then do i3=1,n3 do i2=n2-1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len,buff_id )= u(i1,i2,i3) enddo enddo enddo endif if( axis .eq. 3 )then do i3=n3-1,n3 do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,i3) enddo enddo enddo endif dir = -1 buff_id = 2 + dir buff_len = 0 if( axis .eq. 1 )then do i3=1,n3 do i2=1,n2 buff_len = buff_len + 1 buff(buff_len,buff_id ) = u( 2, i2,i3) enddo enddo endif if( axis .eq. 2 )then do i3=1,n3 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1, 2,i3) enddo enddo endif if( axis .eq. 3 )then do i2=1,n2 do i1=1,n1 buff_len = buff_len + 1 buff(buff_len, buff_id ) = u( i1,i2,2) enddo enddo endif endif do i=1,nm2 buff(i,4) = buff(i,3) buff(i,2) = buff(i,1) enddo return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine zran3(z,n1,n2,n3,nx,ny,k) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c zran3 loads +1 at ten randomly chosen points, c loads -1 at a different ten random points, c and zero elsewhere. c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer is1, is2, is3, ie1, ie2, ie3 common /grid/ is1,is2,is3,ie1,ie2,ie3 integer n1, n2, n3, k, nx, ny, ierr, i0, m0, m1 double precision z(n1,n2,n3) integer mm, i1, i2, i3, d1, e1, e2, e3 double precision x, a double precision xx, x0, x1, a1, a2, ai, power parameter( mm = 10, a = 5.D0 ** 13, x = 314159265.D0) double precision ten( mm, 0:1 ), temp, best integer i, j1( mm, 0:1 ), j2( mm, 0:1 ), j3( mm, 0:1 ) integer jg( 0:3, mm, 0:1 ), jg_temp(4) external randlc double precision randlc, rdummy a1 = power( a, nx, 1, 0 ) a2 = power( a, nx, ny, 0 ) call zero3(z,n1,n2,n3) c i = is1-2+nx*(is2-2+ny*(is3-2)) ai = power( a, nx, is2-2+ny*(is3-2), is1-2 ) d1 = ie1 - is1 + 1 e1 = ie1 - is1 + 2 e2 = ie2 - is2 + 2 e3 = ie3 - is3 + 2 x0 = x rdummy = randlc( x0, ai ) do i3 = 2, e3 x1 = x0 do i2 = 2, e2 xx = x1 call vranlc( d1, xx, a, z( 2, i2, i3 )) rdummy = randlc( x1, a1 ) enddo rdummy = randlc( x0, a2 ) enddo c--------------------------------------------------------------------- c call comm3(z,n1,n2,n3) c call showall(z,n1,n2,n3) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c each processor looks for twenty candidates c--------------------------------------------------------------------- do i=1,mm ten( i, 1 ) = 0.0D0 j1( i, 1 ) = 0 j2( i, 1 ) = 0 j3( i, 1 ) = 0 ten( i, 0 ) = 1.0D0 j1( i, 0 ) = 0 j2( i, 0 ) = 0 j3( i, 0 ) = 0 enddo do i3=2,n3-1 do i2=2,n2-1 do i1=2,n1-1 if( z(i1,i2,i3) .gt. ten( 1, 1 ) )then ten(1,1) = z(i1,i2,i3) j1(1,1) = i1 j2(1,1) = i2 j3(1,1) = i3 call bubble( ten, j1, j2, j3, mm, 1 ) endif if( z(i1,i2,i3) .lt. ten( 1, 0 ) )then ten(1,0) = z(i1,i2,i3) j1(1,0) = i1 j2(1,0) = i2 j3(1,0) = i3 call bubble( ten, j1, j2, j3, mm, 0 ) endif enddo enddo enddo call mpi_barrier(mpi_comm_world,ierr) c--------------------------------------------------------------------- c Now which of these are globally best? c--------------------------------------------------------------------- i1 = mm i0 = mm do i=mm,1,-1 best = z( j1(i1,1), j2(i1,1), j3(i1,1) ) call mpi_allreduce(best,temp,1,dp_type, > mpi_max,mpi_comm_world,ierr) best = temp if(best.eq.z(j1(i1,1),j2(i1,1),j3(i1,1)))then jg( 0, i, 1) = me jg( 1, i, 1) = is1 - 2 + j1( i1, 1 ) jg( 2, i, 1) = is2 - 2 + j2( i1, 1 ) jg( 3, i, 1) = is3 - 2 + j3( i1, 1 ) i1 = i1-1 else jg( 0, i, 1) = 0 jg( 1, i, 1) = 0 jg( 2, i, 1) = 0 jg( 3, i, 1) = 0 endif ten( i, 1 ) = best call mpi_allreduce(jg(0,i,1), jg_temp,4,MPI_INTEGER, > mpi_max,mpi_comm_world,ierr) jg( 0, i, 1) = jg_temp(1) jg( 1, i, 1) = jg_temp(2) jg( 2, i, 1) = jg_temp(3) jg( 3, i, 1) = jg_temp(4) best = z( j1(i0,0), j2(i0,0), j3(i0,0) ) call mpi_allreduce(best,temp,1,dp_type, > mpi_min,mpi_comm_world,ierr) best = temp if(best.eq.z(j1(i0,0),j2(i0,0),j3(i0,0)))then jg( 0, i, 0) = me jg( 1, i, 0) = is1 - 2 + j1( i0, 0 ) jg( 2, i, 0) = is2 - 2 + j2( i0, 0 ) jg( 3, i, 0) = is3 - 2 + j3( i0, 0 ) i0 = i0-1 else jg( 0, i, 0) = 0 jg( 1, i, 0) = 0 jg( 2, i, 0) = 0 jg( 3, i, 0) = 0 endif ten( i, 0 ) = best call mpi_allreduce(jg(0,i,0), jg_temp,4,MPI_INTEGER, > mpi_max,mpi_comm_world,ierr) jg( 0, i, 0) = jg_temp(1) jg( 1, i, 0) = jg_temp(2) jg( 2, i, 0) = jg_temp(3) jg( 3, i, 0) = jg_temp(4) enddo m1 = i1+1 m0 = i0+1 c if( me .eq. root) then c write(*,*)' ' c write(*,*)' negative charges at' c write(*,9)(jg(1,i,0),jg(2,i,0),jg(3,i,0),i=1,mm) c write(*,*)' positive charges at' c write(*,9)(jg(1,i,1),jg(2,i,1),jg(3,i,1),i=1,mm) c write(*,*)' small random numbers were' c write(*,8)(ten( i,0),i=mm,1,-1) c write(*,*)' and they were found on processor number' c write(*,7)(jg(0,i,0),i=mm,1,-1) c write(*,*)' large random numbers were' c write(*,8)(ten( i,1),i=mm,1,-1) c write(*,*)' and they were found on processor number' c write(*,7)(jg(0,i,1),i=mm,1,-1) c endif c 9 format(5(' (',i3,2(',',i3),')')) c 8 format(5D15.8) c 7 format(10i4) call mpi_barrier(mpi_comm_world,ierr) do i3=1,n3 do i2=1,n2 do i1=1,n1 z(i1,i2,i3) = 0.0D0 enddo enddo enddo do i=mm,m0,-1 z( j1(i,0), j2(i,0), j3(i,0) ) = -1.0D0 enddo do i=mm,m1,-1 z( j1(i,1), j2(i,1), j3(i,1) ) = +1.0D0 enddo call comm3(z,n1,n2,n3,k) c--------------------------------------------------------------------- c call showall(z,n1,n2,n3) c--------------------------------------------------------------------- return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine show_l(z,n1,n2,n3) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer n1,n2,n3,i1,i2,i3,ierr double precision z(n1,n2,n3) integer m1, m2, m3,i m1 = min(n1,18) m2 = min(n2,14) m3 = min(n3,18) write(*,*)' ' do i=0,nprocs-1 if( me .eq. i )then write(*,*)' id = ', me do i3=1,m3 do i1=1,m1 write(*,6)(z(i1,i2,i3),i2=1,m2) enddo write(*,*)' - - - - - - - ' enddo write(*,*)' ' 6 format(6f15.11) endif call mpi_barrier(mpi_comm_world,ierr) enddo return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine showall(z,n1,n2,n3) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer n1,n2,n3,i1,i2,i3,i,ierr double precision z(n1,n2,n3) integer m1, m2, m3 m1 = min(n1,18) m2 = min(n2,14) m3 = min(n3,18) write(*,*)' ' do i=0,nprocs-1 if( me .eq. i )then write(*,*)' id = ', me do i3=1,m3 do i1=1,m1 write(*,6)(z(i1,i2,i3),i2=1,m2) enddo write(*,*)' - - - - - - - ' enddo write(*,*)' ' 6 format(15f6.3) endif call mpi_barrier(mpi_comm_world,ierr) enddo return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine show(z,n1,n2,n3) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer n1,n2,n3,i1,i2,i3,ierr,i double precision z(n1,n2,n3) write(*,*)' ' do i=0,nprocs-1 if( me .eq. i )then write(*,*)' id = ', me do i3=2,n3-1 do i1=2,n1-1 write(*,6)(z(i1,i2,i3),i2=2,n1-1) enddo write(*,*)' - - - - - - - ' enddo write(*,*)' ' 6 format(8D10.3) endif call mpi_barrier(mpi_comm_world,ierr) enddo c call comm3(z,n1,n2,n3) return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- double precision function power( a, n1, n2, n3 ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c power raises an integer, disguised as a double c precision real, to an integer power. c This version tries to avoid integer overflow by treating c it as expressed in a form of "n1*n2+n3". c--------------------------------------------------------------------- implicit none double precision a, aj integer n1, n2, n3 integer n1j, n2j, nj external randlc double precision randlc, rdummy power = 1.0d0 aj = a nj = n3 n1j = n1 n2j = n2 100 continue if( n2j .gt. 0 ) then if( mod(n2j,2) .eq. 1 ) nj = nj + n1j n2j = n2j/2 else if( nj .eq. 0 ) then go to 200 endif if( mod(nj,2) .eq. 1 ) rdummy = randlc( power, aj ) rdummy = randlc( aj, aj ) nj = nj/2 go to 100 200 continue return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine bubble( ten, j1, j2, j3, m, ind ) c--------------------------------------------------------------------- c--------------------------------------------------------------------- c--------------------------------------------------------------------- c bubble does a bubble sort in direction dir c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer m, ind, j1( m, 0:1 ), j2( m, 0:1 ), j3( m, 0:1 ) double precision ten( m, 0:1 ) double precision temp integer i, j_temp if( ind .eq. 1 )then do i=1,m-1 if( ten(i,ind) .gt. ten(i+1,ind) )then temp = ten( i+1, ind ) ten( i+1, ind ) = ten( i, ind ) ten( i, ind ) = temp j_temp = j1( i+1, ind ) j1( i+1, ind ) = j1( i, ind ) j1( i, ind ) = j_temp j_temp = j2( i+1, ind ) j2( i+1, ind ) = j2( i, ind ) j2( i, ind ) = j_temp j_temp = j3( i+1, ind ) j3( i+1, ind ) = j3( i, ind ) j3( i, ind ) = j_temp else return endif enddo else do i=1,m-1 if( ten(i,ind) .lt. ten(i+1,ind) )then temp = ten( i+1, ind ) ten( i+1, ind ) = ten( i, ind ) ten( i, ind ) = temp j_temp = j1( i+1, ind ) j1( i+1, ind ) = j1( i, ind ) j1( i, ind ) = j_temp j_temp = j2( i+1, ind ) j2( i+1, ind ) = j2( i, ind ) j2( i, ind ) = j_temp j_temp = j3( i+1, ind ) j3( i+1, ind ) = j3( i, ind ) j3( i, ind ) = j_temp else return endif enddo endif return end c--------------------------------------------------------------------- c--------------------------------------------------------------------- subroutine zero3(z,n1,n2,n3) c--------------------------------------------------------------------- c--------------------------------------------------------------------- implicit none include 'mpinpb.h' integer n1, n2, n3 double precision z(n1,n2,n3) integer i1, i2, i3 do i3=1,n3 do i2=1,n2 do i1=1,n1 z(i1,i2,i3)=0.0D0 enddo enddo enddo return end c----- end of program ------------------------------------------------