This step entails: (1) creating an SPE thread of execution on the PPE, (2) migrating the computation loops from Vector/SIMD Multimedia Extension intrinsics to SPU intrinsics, and finally (3) adding DMA transfers to move data in and out of the SPE's local store (LS).
We assume that the particle data structures cannot be restructured into SOA form. Therefore, we use Step 1a from the previous section (the AOS form). SPU intrinsics are used, and they can be identified by their prefix spu_.
#define END_OF_TIME 10 #define PARTICLES 100000 typedef struct { float x, y, z, w; } vec4D; typedef struct { int particles; // number of particles to process vector float *pos_v; // pointer to array of position vectors vector float *vel_v; // pointer to array of velocity vectors float *inv_mass; // pointer to array of mass vectors vector float force_v; // force vector float dt; // current step in time } context;
# ----------------------------------------------------------------------
# PPE-side build description for the euler_spe example
# ----------------------------------------------------------------------

# Subdirectories (the SPU side of the example lives in spu/)
DIRS := spu

# Target: the PPU program
PROGRAM_ppu := euler_spe

# Local defines: the SPU program archive plus the libspe2 and pthread libraries
IMPORTS := spu/lib_particle_spu.a -lspe2 -lpthread

# make.footer: use the copy under CELL_TOP when that variable is set,
# otherwise fall back to the path relative to this directory.
ifndef CELL_TOP
include ../../../../../buildutils/make.footer
else
include $(CELL_TOP)/buildutils/make.footer
endif
#include <stdio.h> #include <stdlib.h> #include <libspe2.h> #include <pthread.h> #include "particle.h" vec4D pos[PARTICLES] __attribute__ ((aligned (16))); vec4D vel[PARTICLES] __attribute__ ((aligned (16))); vec4D force __attribute__ ((aligned (16))); float inv_mass[PARTICLES] __attribute__ ((aligned (16))); float dt = 1.0f; extern spe_program_handle_t particle; typedef struct ppu_pthread_data { spe_context_ptr_t spe_ctx; pthread_t pthread; unsigned int entry; void *argp; } ppu_pthread_data_t; void *ppu_pthread_function(void *arg) { ppu_pthread_data_t *datap = (ppu_pthread_data_t *)arg; if (spe_context_run(datap->spe_ctx, &datap->entry, 0, datap->argp, NULL, NULL) < 0) { perror ("Failed running context\n"); exit (1); } pthread_exit(NULL); } int main() { ppu_pthread_data_t data; parm_context ctx __attribute__ ((aligned (16))); ctx.particles = PARTICLES; ctx.pos_v = (vector float *)pos; ctx.vel_v = (vector float *)vel; ctx.force_v = *((vector float *)&force); ctx.inv_mass = inv_mass; ctx.dt = dt; /* Create a SPE context */ if ((data.spe_ctx = spe_context_create (0, NULL)) == NULL) { perror ("Failed creating context"); exit (1); } /* Load SPE program into the SPE context*/ if (spe_program_load (data.spe_ctx, &particle)) { perror ("Failed loading program"); exit (1); } /* Initialize context run data */ data.entry = SPE_DEFAULT_ENTRY; data.argp = &ctx; /* Create pthread for each of the SPE contexts */ if (pthread_create (&data.pthread, NULL, &ppu_pthread_function, &data)) { perror ("Failed creating thread"); exit (1); } /* Wait for the threads to complete */ if (pthread_join (data.pthread, NULL)) { perror ("Failed joining thread\n"); exit (1); } return (0); }
# ----------------------------------------------------------------------
# SPU-side build description for the particle program
# ----------------------------------------------------------------------

# Target: the SPU program and the embed library built from it
PROGRAM_spu := particle
LIBRARY_embed := lib_particle_spu.a

# Local defines: search the parent directory for headers
INCLUDE := -I ..

# make.footer: use the copy under CELL_TOP when that variable is set,
# otherwise fall back to the path relative to this directory.
ifndef CELL_TOP
include ../../../../../../buildutils/make.footer
else
include $(CELL_TOP)/buildutils/make.footer
endif
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "particle.h"

/* 1024 vectors * 16 bytes = 16 KB per position/velocity transfer. */
#define PARTICLES_PER_BLOCK 1024

// Local store structures and buffers.
volatile context ctx;
volatile vector float pos[PARTICLES_PER_BLOCK];
volatile vector float vel[PARTICLES_PER_BLOCK];
// DMA target: aligned explicitly because a plain float array carries no
// 16-byte alignment guarantee of its own.
volatile float inv_mass[PARTICLES_PER_BLOCK] __attribute__ ((aligned (16)));

/*
 * SPE entry point for the Euler particle step.
 *
 * spe_id - unused.
 * parm   - effective address of the `context` parameter block in main
 *          storage; only the low 32 bits are used.
 *
 * For every time step (time < END_OF_TIME, advancing by ctx.dt) and every
 * block of up to PARTICLES_PER_BLOCK particles, the block's position,
 * velocity and inverse-mass data are fetched into local store, updated as
 *     pos += vel * dt;   vel += (dt * inv_mass) * force;
 * and the positions and velocities are written back to main storage.
 *
 * Returns 0 on success, 1 if no MFC tag ID could be reserved.
 */
int main(unsigned long long spe_id, unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id;
  vector float dt_v, dt_inv_mass_v;

  (void)spe_id;

  /* Reserve a tag ID; bail out rather than issue DMAs with an invalid tag. */
  tag_id = mfc_tag_reserve();
  if (tag_id == MFC_TAG_INVALID) {
    return (1);
  }
  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle context.
  // Fetch the context, waiting for it to complete.
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(context),
               tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity, inverse_mass. Wait for DMA to
      // complete before performing computation.
      // NOTE(review): cnt * sizeof(float) is a multiple of 16 bytes only
      // when cnt is a multiple of 4 (true for PARTICLES = 100000) - confirm
      // before changing the particle count.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i),
                   cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i),
                   cnt * sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i),
                   cnt * sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j=0; j<cnt; j++) {
        pos[j] = spu_madd(vel[j], dt_v, pos[j]);
        dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
        vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into main storage
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i),
                   cnt * sizeof(vector float), tag_id, MFC_PUT_CMD);

      // Wait for the PUTs before the buffers are reused. Commands in one
      // tag group are not ordered against each other, so the next GET could
      // otherwise overwrite pos/vel before they are written out, or (for a
      // single-block run) re-read main storage before the PUT has landed.
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
    }
  }

  // Every DMA issued above has already been waited for; give the tag back.
  mfc_tag_release(tag_id);

  return (0);
}