/////////////////////////////////////////////////////////////////////// // The SPU file for matrix maltiplication for irs benchmark. // Implements rmatmult3 in a more efficient manner // Authors: Bala & Suraj /////////////////////////////////////////////////////////////////////// #include #include #include #include #include "Domain.h" #include "RadiationData.h" #include typedef struct { RadiationData_t rblk; Domain_t domain; double *x; double *b; } spu_parameter_t; volatile spu_parameter_t spu_param __attribute__ ((aligned (16))); inline void mfc_get_dbl_ul (double buf[], double **dst, unsigned int src, int tag_id) // this if for nonaligned memory read { if (src & 0xF) // Unaligned { mfc_get (buf, src & 0xFFFFFFF0, 2 * sizeof (double), tag_id, 0, 0); *dst = buf + 1; } else // Aligned { mfc_get (buf, src, sizeof (double), tag_id, 0, 0); *dst = buf; } } inline void mfc_put_dbl_ul (unsigned int dst, double data) // non aligned memory write { int tag_id = 1; char spu_data [128] __attribute__ ((aligned (16))); //printf("#0x%x\n", dst); if ((dst & 0xF) == 0) // Aligned { memcpy(spu_data, &data, sizeof(double)); mfc_put (spu_data, dst, sizeof (double), tag_id, 0, 0); } else if ((dst & 0xF) == 4) // Unaligned. Within the first 16 bytes { mfc_get(spu_data, dst & 0xFFFFFFF0, 2 * sizeof (double), tag_id, 0, 0); spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); memcpy(spu_data+4, &data, sizeof(double)); mfc_put (spu_data, dst & 0xFFFFFFF0, 2 * sizeof (double), tag_id, 0, 0); } else if ((dst & 0xF) == 8) // Unaligned { mfc_get(spu_data, dst & 0xFFFFFFF0, 2 * sizeof (double), tag_id, 0, 0); spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); memcpy(spu_data+8, &data, sizeof(double)); mfc_put (spu_data, dst & 0xFFFFFFF0, 2 * sizeof (double), tag_id, 0, 0); } else if ((dst & 0xF) == 12) // Unaligned { mfc_get(spu_data, dst & 0xFFFFFFF0, 4 * sizeof (double), tag_id, 0, 0); spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); memcpy(spu_data+12, &data, sizeof(double)); mfc_put (spu_data, dst & 0xFFFFFFF0, 4 * sizeof (double), tag_id, 0, 0); } else printf("Unknown alignment %d\n", dst & 15); spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); } int main (unsigned long long speid, unsigned long long argp) { const int tag_id = 0; int i, ii, jj, kk; int imin, imax, jmin, jmax, kmin, kmax; int jp, kp; double *dbl, *dbc, *dbr, *dcl, *dcc, *dcr, *dfl, *dfr, *dfc, *cbl, *cbc, *cbr, *ccl, *ccc, *ccr; double *cfl, *cfc, *cfr, *ubl, *ubc, *ubr, *ucl, *ucc, *ucr, *ufl, *ufc, *ufr; double *xdbl, *xdbc, *xdbr, *xdcl, *xdcc, *xdcr, *xdfl, *xdfc, *xdfr; double *xcbl, *xcbr, *xcbc, *xccl, *xccc, *xccr, *xcfl, *xcfc, *xcfr; double *xubl, *xubc, *xubr, *xucl, *xucc, *xucr, *xufl, *xufc, *xufr; double ib[2]; double *x, *b; double dummy1[4] __attribute__ ((aligned (16))); // Just to start the 16 byte alignment double idbl_buf[4], idbc_buf[4], idbr_buf[4], idcl_buf[4], idcc_buf[4], idcr_buf[4], idfl_buf[4], idfc_buf[4], idfr_buf[4], icbl_buf[4], icbc_buf[4], icbr_buf[4], iccl_buf[4], iccc_buf[4], iccr_buf[4], icfl_buf[4], icfc_buf[4], icfr_buf[4], iubl_buf[4], iubc_buf[4], iubr_buf[4], iucl_buf[4], iucc_buf[4], iucr_buf[4], iufl_buf[4], iufc_buf[4], iufr_buf[4]; double ixdbl_buf[4], ixdbc_buf[4], ixdbr_buf[4], ixdcl_buf[4], ixdcc_buf[4], ixdcr_buf[4], ixdfl_buf[4], ixdfc_buf[4], ixdfr_buf[4], ixcbl_buf[4], ixcbc_buf[4], ixcbr_buf[4], ixccl_buf[4], ixccc_buf[4], ixccr_buf[4], ixcfl_buf[4], ixcfc_buf[4], ixcfr_buf[4], ixubl_buf[4], ixubc_buf[4], ixubr_buf[4], ixucl_buf[4], ixucc_buf[4], ixucr_buf[4], ixufl_buf[4], ixufc_buf[4], ixufr_buf[4]; double *idbl, *idbc, *idbr, *idcl, *idcc, *idcr, *idfl, *idfc, *idfr, *icbl, *icbc, *icbr, *iccl, *iccc, *iccr, *icfl, *icfc, *icfr, *iubl, *iubc, *iubr, *iucl, *iucc, *iucr, *iufl, *iufc, *iufr; double *ixdbl, *ixdbc, *ixdbr, *ixdcl, *ixdcc, *ixdcr, *ixdfl, *ixdfc, *ixdfr, *ixcbl, *ixcbc, *ixcbr, *ixccl, *ixccc, *ixccr, *ixcfl, *ixcfc, *ixcfr, *ixubl, *ixubc, *ixubr, *ixucl, *ixucc, *ixucr, *ixufl, *ixufc, *ixufr; int t = 0; // tag_id = mfc_tag_reserve(); //DMA the full spu argument structure spu_mfcdma32 ((void *) (&spu_param), (unsigned int) argp, sizeof (spu_parameter_t), tag_id, MFC_GET_CMD); spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); imin = spu_param.domain.imin; imax = spu_param.domain.imax; jmin = spu_param.domain.jmin; jmax = spu_param.domain.jmax; kmin = spu_param.domain.kmin; kmax = spu_param.domain.kmax; jp = spu_param.domain.jp; kp = spu_param.domain.kp; // printf // ("imin = %d \n imax = %d \n jmin = %d \njmax = %d \nkmin = %d \nkmax = %d \njp = %d \nkp = %d \n", // imin, imax, jmin, jmax, kmin, kmax, jp, kp); x = spu_param.x; dbl = spu_param.rblk.dbl; dbc = spu_param.rblk.dbc; dbr = spu_param.rblk.dbr; dcl = spu_param.rblk.dcl; dcc = spu_param.rblk.dcc; dcr = spu_param.rblk.dcr; dfl = spu_param.rblk.dfl; dfc = spu_param.rblk.dfc; dfr = spu_param.rblk.dfr; cbl = spu_param.rblk.cbl; cbc = spu_param.rblk.cbc; cbr = spu_param.rblk.cbr; ccl = spu_param.rblk.ccl; ccc = spu_param.rblk.ccc; ccr = spu_param.rblk.ccr; cfl = spu_param.rblk.cfl; cfc = spu_param.rblk.cfc; cfr = spu_param.rblk.cfr; ubl = spu_param.rblk.ubl; ubc = spu_param.rblk.ubc; ubr = spu_param.rblk.ubr; ucl = spu_param.rblk.ucl; ucc = spu_param.rblk.ucc; ucr = spu_param.rblk.ucr; ufl = spu_param.rblk.ufl; ufc = spu_param.rblk.ufc; ufr = spu_param.rblk.ufr; xdbl = x - kp - jp - 1; xdbc = x - kp - jp; xdbr = x - kp - jp + 1; xdcl = x - kp - 1; xdcc = x - kp; xdcr = x - kp + 1; xdfl = x - kp + jp - 1; xdfc = x - kp + jp; xdfr = x - kp + jp + 1; xcbl = x - jp - 1; xcbc = x - jp; xcbr = x - jp + 1; xccl = x - 1; xccc = x; xccr = x + 1; xcfl = x + jp - 1; xcfc = x + jp; xcfr = x + jp + 1; xubl = x + kp - jp - 1; xubc = x + kp - jp; xubr = x + kp - jp + 1; xucl = x + kp - 1; xucc = x + kp; xucr = x + kp + 1; xufl = x + kp + jp - 1; xufc = x + kp + jp; xufr = x + kp + jp + 1; b = spu_param.b; i = imin + jj * jp + kk * kp; for (kk = kmin; kk < kmax; kk++) { // optimization to reduce the number of additions kk_optimized = kk * kp; //printf("#kk%d\n", kk); for (jj = jmin; jj < jmax; jj++) { // optimization to reduce the number of additions jj_optimized = jj * jp; // this is the memory prefetch state that is done. //printf("#jj%d\n", jj); i = imin + jj * jp + kk * kp; mfc_get_dbl_ul (&ixdbl_buf[t], &ixdbl, (unsigned int) &xdbl[i], tag_id); mfc_get_dbl_ul (&ixdbc_buf[t], &ixdbc, (unsigned int) &xdbc[i], tag_id); mfc_get_dbl_ul (&ixdbr_buf[t], &ixdbr, (unsigned int) &xdbr[i], tag_id); mfc_get_dbl_ul (&ixdcl_buf[t], &ixdcl, (unsigned int) &xdcl[i], tag_id); mfc_get_dbl_ul (&ixdcc_buf[t], &ixdcc, (unsigned int) &xdcc[i], tag_id); mfc_get_dbl_ul (&ixdcr_buf[t], &ixdcr, (unsigned int) &xdcr[i], tag_id); mfc_get_dbl_ul (&ixdfl_buf[t], &ixdfl, (unsigned int) &xdfl[i], tag_id); mfc_get_dbl_ul (&ixdfc_buf[t], &ixdfc, (unsigned int) &xdfc[i], tag_id); mfc_get_dbl_ul (&ixdfr_buf[t], &ixdfr, (unsigned int) &xdfr[i], tag_id); mfc_get_dbl_ul (&ixcbl_buf[t], &ixcbl, (unsigned int) &xcbl[i], tag_id); mfc_get_dbl_ul (&ixcbc_buf[t], &ixcbc, (unsigned int) &xcbc[i], tag_id); mfc_get_dbl_ul (&ixcbr_buf[t], &ixcbr, (unsigned int) &xcbr[i], tag_id); mfc_get_dbl_ul (&ixccl_buf[t], &ixccl, (unsigned int) &xccl[i], tag_id); mfc_get_dbl_ul (&ixccc_buf[t], &ixccc, (unsigned int) &xccc[i], tag_id); mfc_get_dbl_ul (&ixccr_buf[t], &ixccr, (unsigned int) &xccr[i], tag_id); mfc_get_dbl_ul (&ixcfl_buf[t], &ixcfl, (unsigned int) &xcfl[i], tag_id); mfc_get_dbl_ul (&ixcfc_buf[t], &ixcfc, (unsigned int) &xcfc[i], tag_id); mfc_get_dbl_ul (&ixcfr_buf[t], &ixcfr, (unsigned int) &xcfr[i], tag_id); mfc_get_dbl_ul (&ixubl_buf[t], &ixubl, (unsigned int) &xubl[i], tag_id); mfc_get_dbl_ul (&ixubc_buf[t], &ixubc, (unsigned int) &xubc[i], tag_id); mfc_get_dbl_ul (&ixubr_buf[t], &ixubr, (unsigned int) &xubr[i], tag_id); mfc_get_dbl_ul (&ixucl_buf[t], &ixucl, (unsigned int) &xucl[i], tag_id); mfc_get_dbl_ul (&ixucc_buf[t], &ixucc, (unsigned int) &xucc[i], tag_id); mfc_get_dbl_ul (&ixucr_buf[t], &ixucr, (unsigned int) &xucr[i], tag_id); mfc_get_dbl_ul (&ixufl_buf[t], &ixufl, (unsigned int) &xufl[i], tag_id); mfc_get_dbl_ul (&ixufc_buf[t], &ixufc, (unsigned int) &xufc[i], tag_id); mfc_get_dbl_ul (&ixufr_buf[t], &ixufr, (unsigned int) &xufr[i], tag_id); mfc_get_dbl_ul (&idbl_buf[t], &idbl, (unsigned int) &dbl[i], tag_id); mfc_get_dbl_ul (&idbc_buf[t], &idbc, (unsigned int) &dbc[i], tag_id); mfc_get_dbl_ul (&idbr_buf[t], &idbr, (unsigned int) &dbr[i], tag_id); mfc_get_dbl_ul (&idcl_buf[t], &idcl, (unsigned int) &dcl[i], tag_id); mfc_get_dbl_ul (&idcc_buf[t], &idcc, (unsigned int) &dcc[i], tag_id); mfc_get_dbl_ul (&idcr_buf[t], &idcr, (unsigned int) &dcr[i], tag_id); mfc_get_dbl_ul (&idfl_buf[t], &idfl, (unsigned int) &dfl[i], tag_id); mfc_get_dbl_ul (&idfc_buf[t], &idfc, (unsigned int) &dfc[i], tag_id); mfc_get_dbl_ul (&idfr_buf[t], &idfr, (unsigned int) &dfr[i], tag_id); mfc_get_dbl_ul (&icbl_buf[t], &icbl, (unsigned int) &cbl[i], tag_id); mfc_get_dbl_ul (&icbc_buf[t], &icbc, (unsigned int) &cbc[i], tag_id); mfc_get_dbl_ul (&icbr_buf[t], &icbr, (unsigned int) &cbr[i], tag_id); mfc_get_dbl_ul (&iccl_buf[t], &iccl, (unsigned int) &ccl[i], tag_id); mfc_get_dbl_ul (&iccc_buf[t], &iccc, (unsigned int) &ccc[i], tag_id); mfc_get_dbl_ul (&iccr_buf[t], &iccr, (unsigned int) &ccr[i], tag_id); mfc_get_dbl_ul (&icfl_buf[t], &icfl, (unsigned int) &cfl[i], tag_id); mfc_get_dbl_ul (&icfc_buf[t], &icfc, (unsigned int) &cfc[i], tag_id); mfc_get_dbl_ul (&icfr_buf[t], &icfr, (unsigned int) &cfr[i], tag_id); mfc_get_dbl_ul (&iubl_buf[t], &iubl, (unsigned int) &ubl[i], tag_id); mfc_get_dbl_ul (&iubc_buf[t], &iubc, (unsigned int) &ubc[i], tag_id); mfc_get_dbl_ul (&iubr_buf[t], &iubr, (unsigned int) &ubr[i], tag_id); mfc_get_dbl_ul (&iucl_buf[t], &iucl, (unsigned int) &ucl[i], tag_id); mfc_get_dbl_ul (&iucc_buf[t], &iucc, (unsigned int) &ucc[i], tag_id); mfc_get_dbl_ul (&iucr_buf[t], &iucr, (unsigned int) &ucr[i], tag_id); mfc_get_dbl_ul (&iufl_buf[t], &iufl, (unsigned int) &ufl[i], tag_id); mfc_get_dbl_ul (&iufc_buf[t], &iufc, (unsigned int) &ufc[i], tag_id); mfc_get_dbl_ul (&iufr_buf[t], &iufr, (unsigned int) &ufr[i], tag_id); for (ii = imin; ii < imax; ii++) { // printf("#ii%d\n", ii); // optimized calculation i = ii + jj_optimized + kk_optimized; // toggle to toggle between the memory prefetch t = t ^ 2; spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); if (ii < (imax - 1)) { mfc_get_dbl_ul (&ixdbl_buf[t], &ixdbl, (unsigned int) &xdbl[i + 1], tag_id); mfc_get_dbl_ul (&ixdbc_buf[t], &ixdbc, (unsigned int) &xdbc[i + 1], tag_id); mfc_get_dbl_ul (&ixdbr_buf[t], &ixdbr, (unsigned int) &xdbr[i + 1], tag_id); mfc_get_dbl_ul (&ixdcl_buf[t], &ixdcl, (unsigned int) &xdcl[i + 1], tag_id); mfc_get_dbl_ul (&ixdcc_buf[t], &ixdcc, (unsigned int) &xdcc[i + 1], tag_id); mfc_get_dbl_ul (&ixdcr_buf[t], &ixdcr, (unsigned int) &xdcr[i + 1], tag_id); mfc_get_dbl_ul (&ixdfl_buf[t], &ixdfl, (unsigned int) &xdfl[i + 1], tag_id); mfc_get_dbl_ul (&ixdfc_buf[t], &ixdfc, (unsigned int) &xdfc[i + 1], tag_id); mfc_get_dbl_ul (&ixdfr_buf[t], &ixdfr, (unsigned int) &xdfr[i + 1], tag_id); mfc_get_dbl_ul (&ixcbl_buf[t], &ixcbl, (unsigned int) &xcbl[i + 1], tag_id); mfc_get_dbl_ul (&ixcbc_buf[t], &ixcbc, (unsigned int) &xcbc[i + 1], tag_id); mfc_get_dbl_ul (&ixcbr_buf[t], &ixcbr, (unsigned int) &xcbr[i + 1], tag_id); mfc_get_dbl_ul (&ixccl_buf[t], &ixccl, (unsigned int) &xccl[i + 1], tag_id); mfc_get_dbl_ul (&ixccc_buf[t], &ixccc, (unsigned int) &xccc[i + 1], tag_id); mfc_get_dbl_ul (&ixccr_buf[t], &ixccr, (unsigned int) &xccr[i + 1], tag_id); mfc_get_dbl_ul (&ixcfl_buf[t], &ixcfl, (unsigned int) &xcfl[i + 1], tag_id); mfc_get_dbl_ul (&ixcfc_buf[t], &ixcfc, (unsigned int) &xcfc[i + 1], tag_id); mfc_get_dbl_ul (&ixcfr_buf[t], &ixcfr, (unsigned int) &xcfr[i + 1], tag_id); mfc_get_dbl_ul (&ixubl_buf[t], &ixubl, (unsigned int) &xubl[i + 1], tag_id); mfc_get_dbl_ul (&ixubc_buf[t], &ixubc, (unsigned int) &xubc[i + 1], tag_id); mfc_get_dbl_ul (&ixubr_buf[t], &ixubr, (unsigned int) &xubr[i + 1], tag_id); mfc_get_dbl_ul (&ixucl_buf[t], &ixucl, (unsigned int) &xucl[i + 1], tag_id); mfc_get_dbl_ul (&ixucc_buf[t], &ixucc, (unsigned int) &xucc[i + 1], tag_id); mfc_get_dbl_ul (&ixucr_buf[t], &ixucr, (unsigned int) &xucr[i + 1], tag_id); mfc_get_dbl_ul (&ixufl_buf[t], &ixufl, (unsigned int) &xufl[i + 1], tag_id); mfc_get_dbl_ul (&ixufc_buf[t], &ixufc, (unsigned int) &xufc[i + 1], tag_id); mfc_get_dbl_ul (&ixufr_buf[t], &ixufr, (unsigned int) &xufr[i + 1], tag_id); mfc_get_dbl_ul (&idbl_buf[t], &idbl, (unsigned int) &dbl[i + 1], tag_id); mfc_get_dbl_ul (&idbc_buf[t], &idbc, (unsigned int) &dbc[i + 1], tag_id); mfc_get_dbl_ul (&idbr_buf[t], &idbr, (unsigned int) &dbr[i + 1], tag_id); mfc_get_dbl_ul (&idcl_buf[t], &idcl, (unsigned int) &dcl[i + 1], tag_id); mfc_get_dbl_ul (&idcc_buf[t], &idcc, (unsigned int) &dcc[i + 1], tag_id); mfc_get_dbl_ul (&idcr_buf[t], &idcr, (unsigned int) &dcr[i + 1], tag_id); mfc_get_dbl_ul (&idfl_buf[t], &idfl, (unsigned int) &dfl[i + 1], tag_id); mfc_get_dbl_ul (&idfc_buf[t], &idfc, (unsigned int) &dfc[i + 1], tag_id); mfc_get_dbl_ul (&idfr_buf[t], &idfr, (unsigned int) &dfr[i + 1], tag_id); mfc_get_dbl_ul (&icbl_buf[t], &icbl, (unsigned int) &cbl[i + 1], tag_id); mfc_get_dbl_ul (&icbc_buf[t], &icbc, (unsigned int) &cbc[i + 1], tag_id); mfc_get_dbl_ul (&icbr_buf[t], &icbr, (unsigned int) &cbr[i + 1], tag_id); mfc_get_dbl_ul (&iccl_buf[t], &iccl, (unsigned int) &ccl[i + 1], tag_id); mfc_get_dbl_ul (&iccc_buf[t], &iccc, (unsigned int) &ccc[i + 1], tag_id); mfc_get_dbl_ul (&iccr_buf[t], &iccr, (unsigned int) &ccr[i + 1], tag_id); mfc_get_dbl_ul (&icfl_buf[t], &icfl, (unsigned int) &cfl[i + 1], tag_id); mfc_get_dbl_ul (&icfc_buf[t], &icfc, (unsigned int) &cfc[i + 1], tag_id); mfc_get_dbl_ul (&icfr_buf[t], &icfr, (unsigned int) &cfr[i + 1], tag_id); mfc_get_dbl_ul (&iubl_buf[t], &iubl, (unsigned int) &ubl[i + 1], tag_id); mfc_get_dbl_ul (&iubc_buf[t], &iubc, (unsigned int) &ubc[i + 1], tag_id); mfc_get_dbl_ul (&iubr_buf[t], &iubr, (unsigned int) &ubr[i + 1], tag_id); mfc_get_dbl_ul (&iucl_buf[t], &iucl, (unsigned int) &ucl[i + 1], tag_id); mfc_get_dbl_ul (&iucc_buf[t], &iucc, (unsigned int) &ucc[i + 1], tag_id); mfc_get_dbl_ul (&iucr_buf[t], &iucr, (unsigned int) &ucr[i + 1], tag_id); mfc_get_dbl_ul (&iufl_buf[t], &iufl, (unsigned int) &ufl[i + 1], tag_id); mfc_get_dbl_ul (&iufc_buf[t], &iufc, (unsigned int) &ufc[i + 1], tag_id); mfc_get_dbl_ul (&iufr_buf[t], &iufr, (unsigned int) &ufr[i + 1], tag_id); } // toggle back to compute on the previously prefetched data t = t ^ 2; ib[t] = idbl[t] * ixdbl[t] + idbc[t] * ixdbc[t] + idbr[t] * ixdbr[t] + idcl[t] * ixdcl[t] + idcc[t] * ixdcc[t] + idcr[t] * ixdcr[t] + idfl[t] * ixdfl[t] + idfc[t] * ixdfc[t] + idfr[t] * ixdfr[t] + icbl[t] * ixcbl[t] + icbc[t] * ixcbc[t] + icbr[t] * ixcbr[t] + iccl[t] * ixccl[t] + iccc[t] * ixccc[t] + iccr[t] * ixccr[t] + icfl[t] * ixcfl[t] + icfc[t] * ixcfc[t] + icfr[t] * ixcfr[t] + iubl[t] * ixubl[t] + iubc[t] * ixubc[t] + iubr[t] * ixubr[t] + iucl[t] * ixucl[t] + iucc[t] * ixucc[t] + iucr[t] * ixucr[t] + iufl[t] * ixufl[t] + iufc[t] * ixufc[t] + iufr[t] * ixufr[t]; mfc_put_dbl_ul((unsigned int)&b[i], ib[t]); t = t ^ 2; } spu_writech (MFC_WrTagMask, 1 << tag_id); spu_mfcstat (MFC_TAG_UPDATE_ALL); } } //spu_writech (MFC_WrTagMask, 1 << tag_id); //spu_mfcstat (MFC_TAG_UPDATE_ALL); return 0; }