#include #include #include #include #include #include "irs.h" #include "irssys.h" #include "irscom.h" #include "irsdefs.h" #include "irstdiff.h" #include "DiffusionData.h" #include "FunctionTimer.h" #include "RadiationData.h" #include "TimeStepControl.h" extern spe_program_handle_t spu_irs; #define SPU_CALL_SITE1 1 #define SPU_CALL_SITE2 0 #define SPU_CALL_SITE3 0 #define SPU_CALL_SITE4 0 typedef struct ppu_pthread_data // structure to hold the thread data. { Domain_t *domain; spe_context_ptr_t spe_ctx; pthread_t pthread; void *argp; } ppu_pthread_data_t; typedef struct { RadiationData_t rblk; Domain_t domain; double *x; double *b; } spu_parameter_t; static ppu_pthread_data_t ppu_data[6] __attribute__ ((aligned (16))); // Maximum of 6 blocks static spu_parameter_t spu_arg[6] __attribute__ ((aligned (16))); void * ppu_pthread_function (void *arg) { char *me = "rmatmult3"; ppu_pthread_data_t *datap = (ppu_pthread_data_t *) arg; Domain_t *domain = datap->domain; double myflops; unsigned int entry = SPE_DEFAULT_ENTRY; printf ("entering spu\n"); FT_INITIALIZE (me, domain->hash); if (spe_context_run (datap->spe_ctx, &entry, 0, datap->argp, NULL, NULL) < 0) { perror ("Failed running context"); exit (1); } myflops = 53.0 * (domain->imax - domain->imin) * (domain->jmax - domain->jmin) * (domain->kmax - domain->kmin); FT_FINALIZE (me, domain->hash, myflops); pthread_exit (NULL); } static void MatrixSolveCG (RadiationData_t * rblk, RadiationData_t * cblk, DiffusionData_t * tblk, double eps, int maxit, int *iterations, double *error); static void MatrixSolvePC2D (RadiationData_t * cblk, RadiationData_t * rblk, Domain_t * domain); static void MatrixSolvePC3D (RadiationData_t * cblk, RadiationData_t * rblk, Domain_t * domain); void MatrixSolveDriver (RadiationData_t * rblk, RadiationData_t * cblk, DiffusionData_t * tblk, Domain_t * domains, int *iterations, double *error) { char *me = "MatrixSolveDriver"; int iblk; int comerr = 0; int my_nblk = nblk; double myflops = 0.0; FT_INITIALIZE (me, gv_hash_tbl); if (solver == 1) { } else { if (ifprecon == 0) { #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; for (i = domains[iblk].frz; i <= domains[iblk].lrz; i++) { cblk[iblk].ccc[i] = 1.0; } } } if (ifprecon > 0) { #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; for (i = domains[iblk].frz; i <= domains[iblk].lrz; i++) { cblk[iblk].ccc[i] = 1.0 / (rblk[iblk].ccc[i] + ptiny); } myflops += 5.0 * (domains[iblk].lrz - domains[iblk].frz + 1); } } comerr += rbndcom (RBNDCOM6, COM_RECV, 0); comerr += rbndcom (RBNDCOM6, COM_SEND, 0); comerr += rbndcom (RBNDCOM6, COM_WAIT_RECV, 0); comerr += rbndcom (RBNDCOM6, COM_WAIT_SEND, 0); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { } if (ifprecon >= 2) { if (ndims == 2) { #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { MatrixSolvePC2D (&cblk[iblk], &rblk[iblk], &domains[iblk]); } } if (ndims == 3) { #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { MatrixSolvePC3D (&cblk[iblk], &rblk[iblk], &domains[iblk]); } } } if (ifprecon > 1) { #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { cblkbc (&cblk[iblk], &domains[iblk]); } } MatrixSolveCG (rblk, cblk, tblk, rdifepsx, rdifitx, iterations, error); } FT_FINALIZE (me, gv_hash_tbl, myflops); } static int first_time = 1; static void MatrixSolveCG (RadiationData_t * rblk, RadiationData_t * cblk, DiffusionData_t * tblk, double eps, int maxit, int *iterations, double *error) { char *me = "MatrixSolveCG"; int iter, iblk, comerr; int my_nblk = nblk; double cgsums[4]; double dotp, dotprev, dotrz; double alpha, beta; double normx, normp, normr, normrhs; double rerr, xerr; double **x, **b, **p, **r, **t, **z; double myflops = 0.0; FT_INITIALIZE (me, gv_hash_tbl); // Create a maximum of 6 SPE contexts if (first_time) { first_time = 0; for (iter = 0; iter < 6; iter++) { if ((ppu_data[iter].spe_ctx = spe_context_create (0, NULL)) == NULL) { perror ("Failed creating context"); exit (1); } if (spe_program_load (ppu_data[iter].spe_ctx, &spu_irs)) { perror ("Failed loading program"); exit (1); } } } comerr = 0; x = ALLOT (double *, nblk); b = ALLOT (double *, nblk); p = ALLOT (double *, nblk); r = ALLOT (double *, nblk); t = ALLOT (double *, nblk); z = ALLOT (double *, nblk); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { x[iblk] = tblk[iblk].phi; b[iblk] = tblk[iblk].rhs; p[iblk] = tblk[iblk].dphi; r[iblk] = tblk[iblk].adphi; } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; i = domains[iblk].nnalls + 1; t[iblk] = (double *) memalign (16, sizeof (double) * i); //MALLOT(double,i) ; z[iblk] = (double *) memalign (16, sizeof (double) * i); //MALLOT(double,i) ; for (i = 0; i <= domains[iblk].nnalls; i++) { p[iblk][i] = 0.0; r[iblk][i] = 0.0; t[iblk][i] = 0.0; z[iblk][i] = 0.0; } } comerr += rbndcom (RBNDCOM1, COM_RECV, 0); comerr += rbndcom (RBNDCOM1, COM_SEND, 0); comerr += rbndcom (RBNDCOM1, COM_WAIT_RECV, 0); comerr += rbndcom (RBNDCOM1, COM_WAIT_SEND, 0); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { ; } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz2 (x[iblk], 0.0, &domains[iblk]); } printf ("Block 1: count %d\n", my_nblk); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { #if (SPU_CALL_SITE1 == 1) if (ndims == 2) { rmatmult2 (&domains[iblk], &rblk[iblk], x[iblk], r[iblk]); } else { memcpy (&spu_arg[iblk].domain, &domains[iblk], sizeof (Domain_t)); memcpy (&spu_arg[iblk].rblk, &rblk[iblk], sizeof (RadiationData_t)); spu_arg[iblk].x = (double *) x[iblk]; spu_arg[iblk].b = (double *) r[iblk]; ppu_data[iblk].domain = &domains[iblk]; ppu_data[iblk].argp = &spu_arg[iblk]; if (pthread_create (&ppu_data[iblk].pthread, NULL, &ppu_pthread_function, &ppu_data[iblk])) { perror ("Failed creating thread"); } } #else // TODO: Wait for the SPU function to complete rmatmult (&domains[iblk], &rblk[iblk], x[iblk], r[iblk]); #endif } #if (SPU_CALL_SITE1 == 1) if (ndims != 2) { for (iblk = 0; iblk < my_nblk; iblk++) { if (pthread_join (ppu_data[iblk].pthread, NULL)) { perror ("Failed joining thread"); exit (1); } } } #endif #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; int is; int ie; is = domains[iblk].frz; ie = domains[iblk].lrz; for (i = is; i <= ie; i++) { r[iblk][i] = b[iblk][i] - r[iblk][i]; } myflops += 1.0 * (ie - is + 1); } comerr += rbndcom (RBNDCOM5, COM_RECV, 0); comerr += rbndcom (RBNDCOM5, COM_SEND, 0); comerr += rbndcom (RBNDCOM5, COM_WAIT_RECV, 0); comerr += rbndcom (RBNDCOM5, COM_WAIT_SEND, 0); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { ; } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz2 (r[iblk], 0.0, &domains[iblk]); } printf ("Block 2: count %d\n", my_nblk); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { #if (SPU_CALL_SITE2 == 1) if (ndims == 2) { rmatmult2 (&domains[iblk], &cblk[iblk], r[iblk], z[iblk]); } else { memcpy (&spu_arg[iblk].domain, &domains[iblk], sizeof (Domain_t)); memcpy (&spu_arg[iblk].rblk, &cblk[iblk], sizeof (RadiationData_t)); spu_arg[iblk].x = (double *) r[iblk]; spu_arg[iblk].b = (double *) z[iblk]; ppu_data[iblk].domain = &domains[iblk]; ppu_data[iblk].argp = &spu_arg[iblk]; if (pthread_create (&ppu_data[iblk].pthread, NULL, &ppu_pthread_function, &ppu_data[iblk])) { perror ("Failed creating thread"); } } #else rmatmult (&domains[iblk], &cblk[iblk], r[iblk], z[iblk]); #endif } #if (SPU_CALL_SITE2 == 1) if (ndims != 2) { for (iblk = 0; iblk < my_nblk; iblk++) { if (pthread_join (ppu_data[iblk].pthread, NULL)) { perror ("Failed joining thread"); exit (1); } } } #endif #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz1 (z[iblk], 0.0, &domains[iblk]); setpz2 (z[iblk], 0.0, &domains[iblk]); } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; int is; int ie; is = domains[iblk].frz; ie = domains[iblk].lrz; for (i = is; i <= ie; i++) { p[iblk][i] = z[iblk][i]; } } dotprev = 0.0; normrhs = 0.0; #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; for (i = 0; i < domains[iblk].nnalls; i++) { if (fabs (r[iblk][i]) < ptiny) r[iblk][i] = 0.0; if (fabs (z[iblk][i]) < ptiny) z[iblk][i] = 0.0; if (fabs (b[iblk][i]) < ptiny) b[iblk][i] = 0.0; } } for (iblk = 0; iblk < nblk; iblk++) { dotprev += icdot (r[iblk], z[iblk], domains[iblk].nnalls, &myflops); normrhs += norml2 (&domains[iblk], b[iblk]); } cgsums[0] = dotprev; cgsums[1] = normrhs; if (ifparallel) { comreduce (cgsums, 2, COM_SUM, COM_ALL, COM_DOUBLE); } dotprev = cgsums[0]; normrhs = cgsums[1]; normrhs = sqrt (normrhs) + ptiny; printf ("Max iterations %d\n", maxit); for (iter = 1; iter <= maxit; iter++) { comerr += rbndcom (RBNDCOM1, COM_RECV, 0); comerr += rbndcom (RBNDCOM1, COM_SEND, 0); comerr += rbndcom (RBNDCOM1, COM_WAIT_RECV, 0); comerr += rbndcom (RBNDCOM1, COM_WAIT_SEND, 0); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { ; } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz2 (p[iblk], 0.0, &domains[iblk]); } //printf ("Block 3: count %d\n", my_nblk); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { #if (SPU_CALL_SITE3 == 1) if (ndims == 2) { rmatmult2 (&domains[iblk], &rblk[iblk], p[iblk], t[iblk]); } else { memcpy (&spu_arg[iblk].domain, &domains[iblk], sizeof (Domain_t)); memcpy (&spu_arg[iblk].rblk, &rblk[iblk], sizeof (RadiationData_t)); spu_arg[iblk].x = (double *) p[iblk]; spu_arg[iblk].b = (double *) t[iblk]; ppu_data[iblk].domain = &domains[iblk]; ppu_data[iblk].argp = &spu_arg[iblk]; if (pthread_create (&ppu_data[iblk].pthread, NULL, &ppu_pthread_function, &ppu_data[iblk])) { perror ("Failed creating thread"); } } #else rmatmult (&domains[iblk], &rblk[iblk], p[iblk], t[iblk]); #endif } #if (SPU_CALL_SITE3 == 1) if (ndims != 2) { for (iblk = 0; iblk < my_nblk; iblk++) { if (pthread_join (ppu_data[iblk].pthread, NULL)) { perror ("Failed joining thread"); exit (1); } } } #endif #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz1 (p[iblk], 0.0, &domains[iblk]); setpz1 (t[iblk], 0.0, &domains[iblk]); setpz2 (t[iblk], 0.0, &domains[iblk]); } dotp = 0.0; #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; for (i = 0; i < domains[iblk].nnalls; i++) { if (fabs (p[iblk][i]) < ptiny) p[iblk][i] = 0.0; if (fabs (t[iblk][i]) < ptiny) t[iblk][i] = 0.0; } } for (iblk = 0; iblk < nblk; iblk++) { dotp += icdot (p[iblk], t[iblk], domains[iblk].nnalls, &myflops); } if (ifparallel) { comreduce (&dotp, 1, COM_SUM, COM_ALL, COM_DOUBLE); } alpha = dotprev / (dotp + ptiny); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; int is; int ie; is = domains[iblk].frz; ie = domains[iblk].lrz; for (i = is; i <= ie; i++) { x[iblk][i] = x[iblk][i] + alpha * p[iblk][i]; r[iblk][i] = r[iblk][i] - alpha * t[iblk][i]; } myflops += 4.0 * (ie - is + 1); } comerr += rbndcom (RBNDCOM5, COM_RECV, 0); comerr += rbndcom (RBNDCOM5, COM_SEND, 0); comerr += rbndcom (RBNDCOM5, COM_WAIT_RECV, 0); comerr += rbndcom (RBNDCOM5, COM_WAIT_SEND, 0); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { ; } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz2 (r[iblk], 0.0, &domains[iblk]); } //printf ("Block 4: count %d\n", my_nblk); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { #if (SPU_CALL_SITE4 == 1) if (ndims == 2) { rmatmult2 (&domains[iblk], &cblk[iblk], r[iblk], z[iblk]); } else { memcpy (&spu_arg[iblk].domain, &domains[iblk], sizeof (Domain_t)); memcpy (&spu_arg[iblk].rblk, &cblk[iblk], sizeof (RadiationData_t)); spu_arg[iblk].x = (double *) r[iblk]; spu_arg[iblk].b = (double *) z[iblk]; ppu_data[iblk].domain = &domains[iblk]; ppu_data[iblk].argp = &spu_arg[iblk]; if (pthread_create (&ppu_data[iblk].pthread, NULL, &ppu_pthread_function, &ppu_data[iblk])) { perror ("Failed creating thread"); } } #else rmatmult (&domains[iblk], &cblk[iblk], r[iblk], z[iblk]); #endif } #if (SPU_CALL_SITE4 == 1) if (ndims != 2) { for (iblk = 0; iblk < my_nblk; iblk++) { if (pthread_join (ppu_data[iblk].pthread, NULL)) { perror ("Failed joining thread"); exit (1); } } } #endif #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { setpz1 (x[iblk], 0.0, &domains[iblk]); setpz2 (x[iblk], 0.0, &domains[iblk]); setpz1 (r[iblk], 0.0, &domains[iblk]); setpz1 (z[iblk], 0.0, &domains[iblk]); setpz2 (z[iblk], 0.0, &domains[iblk]); } dotrz = 0.0; normx = 0.0; normp = 0.0; normr = 0.0; #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; for (i = 0; i < domains[iblk].nnalls; i++) { if (fabs (p[iblk][i]) < ptiny) p[iblk][i] = 0.0; if (fabs (r[iblk][i]) < ptiny) r[iblk][i] = 0.0; if (fabs (x[iblk][i]) < ptiny) x[iblk][i] = 0.0; if (fabs (z[iblk][i]) < ptiny) z[iblk][i] = 0.0; } } for (iblk = 0; iblk < nblk; iblk++) { dotrz += icdot (r[iblk], z[iblk], domains[iblk].nnalls, &myflops); normx += norml2 (&domains[iblk], x[iblk]); normp += norml2 (&domains[iblk], p[iblk]); normr += norml2 (&domains[iblk], r[iblk]); } cgsums[0] = dotrz; cgsums[1] = normx; cgsums[2] = normp; cgsums[3] = normr; if (ifparallel) { comreduce (cgsums, 4, COM_SUM, COM_ALL, COM_DOUBLE); } dotrz = cgsums[0]; normx = cgsums[1]; normp = cgsums[2]; normr = cgsums[3]; normx = sqrt (normx); normp = sqrt (normp); normr = sqrt (normr); xerr = fabs (alpha) * normp / (normx + ptiny); rerr = normr / normrhs; if ((xerr < eps) && (rerr < eps) && (iter >= rdifitn)) { break; } beta = dotrz / (dotprev + ptiny); dotprev = dotrz; #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; int is; int ie; is = domains[iblk].frz; ie = domains[iblk].lrz; for (i = is; i <= ie; i++) { p[iblk][i] = z[iblk][i] + beta * p[iblk][i]; } myflops += 2.0 * (ie - is + 1); } } *iterations = iter; *error = MAX (xerr, rerr); #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { int i; int zone_max = -1; double value_max = -plarge; TimeStepControl_t *tsc = TimeStepControl_find ("TSC_Residual", domains[iblk].hash); for (i = domains[iblk].frz; i <= domains[iblk].lrz; i++) { if (fabs (r[iblk][i]) > value_max) { zone_max = i; value_max = fabs (r[iblk][i]); } } tsc->value = 1.0 / (value_max + ptiny); if (ndims == 2) NDXEXT2D (zone_max, tsc->i, tsc->j, domains[iblk]); if (ndims == 3) NDXEXT3D (zone_max, tsc->i, tsc->j, tsc->k, domains[iblk]); myflops += 1.0 * (domains[iblk].lrz - domains[iblk].frz + 1); } #include "pardo.h" for (iblk = 0; iblk < my_nblk; iblk++) { FREEMEM (t[iblk]); FREEMEM (z[iblk]); } FREEMEM (x); FREEMEM (b); FREEMEM (p); FREEMEM (r); FREEMEM (t); FREEMEM (z); FT_FINALIZE (me, gv_hash_tbl, myflops); } static void MatrixSolvePC2D (RadiationData_t * cblk, RadiationData_t * rblk, Domain_t * domain) { char *me = "MatrixSolvePC2D"; int i, jp, frz, lpz; double myflops = 0.0; FT_INITIALIZE (me, domain->hash); jp = domain->jp; frz = domain->frz; lpz = domain->lpz; for (i = frz; i <= lpz; i++) { cblk->ccl[i] -= cblk->ccc[i] * rblk->ccl[i] * cblk->ccc[i - 1]; cblk->cbc[i] -= cblk->ccc[i] * rblk->cbc[i] * cblk->ccc[i - jp]; cblk->cbl[i] -= cblk->ccc[i] * rblk->cbl[i] * cblk->ccc[i - 1 - jp]; cblk->cbr[i] -= cblk->ccc[i] * rblk->cbr[i] * cblk->ccc[i + 1 - jp]; } myflops += 12.0 * (lpz - frz + 1); FT_FINALIZE (me, domain->hash, myflops); } static void MatrixSolvePC3D (RadiationData_t * cblk, RadiationData_t * rblk, Domain_t * domain) { char *me = "MatrixSolvePC3D"; double *ccci, *ddbl, *ddbc, *ddbr, *ddcl, *ddcc, *ddcr, *ddfl, *ddfc, *ddfr; double *dcbl, *dcbc, *dcbr, *dccl; int comerr; int i; double myflops = 0.0; FT_INITIALIZE (me, domain->hash); dccl = cblk->ccc - 1; dcbr = cblk->ccc - domain->jp + 1; dcbc = cblk->ccc - domain->jp; dcbl = cblk->ccc - domain->jp - 1; ddfr = cblk->ccc - domain->kp + domain->jp + 1; ddfc = cblk->ccc - domain->kp + domain->jp; ddfl = cblk->ccc - domain->kp + domain->jp - 1; ddcr = cblk->ccc - domain->kp + 1; ddcc = cblk->ccc - domain->kp; ddcl = cblk->ccc - domain->kp - 1; ddbr = cblk->ccc - domain->kp - domain->jp + 1; ddbc = cblk->ccc - domain->kp - domain->jp; ddbl = cblk->ccc - domain->kp - domain->jp - 1; for (i = domain->frz; i <= domain->lpz; i++) { cblk->ccl[i] = -rblk->ccl[i] * cblk->ccc[i] * dccl[i]; cblk->cbr[i] = -rblk->cbr[i] * cblk->ccc[i] * dcbr[i]; cblk->cbc[i] = -rblk->cbc[i] * cblk->ccc[i] * dcbc[i]; cblk->cbl[i] = -rblk->cbl[i] * cblk->ccc[i] * dcbl[i]; cblk->dfr[i] = -rblk->dfr[i] * cblk->ccc[i] * ddfr[i]; cblk->dfc[i] = -rblk->dfc[i] * cblk->ccc[i] * ddfc[i]; cblk->dfl[i] = -rblk->dfl[i] * cblk->ccc[i] * ddfl[i]; cblk->dcr[i] = -rblk->dcr[i] * cblk->ccc[i] * ddcr[i]; cblk->dcc[i] = -rblk->dcc[i] * cblk->ccc[i] * ddcc[i]; cblk->dcl[i] = -rblk->dcl[i] * cblk->ccc[i] * ddcl[i]; cblk->dbr[i] = -rblk->dbr[i] * cblk->ccc[i] * ddbr[i]; cblk->dbc[i] = -rblk->dbc[i] * cblk->ccc[i] * ddbc[i]; cblk->dbl[i] = -rblk->dbl[i] * cblk->ccc[i] * ddbl[i]; } myflops += 28.0 * (domain->lpz - domain->frz + 1); FT_FINALIZE (me, domain->hash, myflops); }