Step 2: Port the PPE code for execution on the SPE

This step entails: (1) creating an SPE thread of execution on the PPE, (2) migrating the computation loops from Vector/SIMD Multimedia Extension intrinsics to SPU intrinsics, and finally (3) adding DMA transfers to move data in and out of the SPE's local store (LS).

We assume that the particle data structures cannot be restructured into structure-of-arrays (SOA) form. Therefore, we start from Step 1a of the previous section (the array-of-structures, or AOS, form). The SPU intrinsics used in the code can be identified by their spu_ prefix.

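Moving the code from the PPE to the SPE requires the following files: a shared header (particle.h), the PPE Makefile and PPE code, and the SPE Makefile and SPE code.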
particle.h:
// Upper bound of the simulated time: the SPE time loop runs while time < END_OF_TIME.
#define END_OF_TIME     10
// Number of particles: sizes the PPE-side arrays and is passed to the SPE as context.particles.
#define PARTICLES       100000

// Four-component float vector (x, y, z, w); 16 bytes, so the PPE code can
// reinterpret arrays of vec4D as arrays of vector float.
typedef struct {
  float x;
  float y;
  float z;
  float w;
} vec4D;

// Parameter block shared between the PPE and the SPE. The PPE fills it in and
// passes its address to the SPE program, which copies it into local store with
// a DMA of sizeof(context) bytes. The pointer members hold main-storage
// addresses of the PPE-side arrays.
// NOTE(review): the SPE code truncates these pointers to 32 bits, so the
// layout presumably assumes a 32-bit PPE build -- confirm.
typedef struct {
  int particles;        // number of particles to process
  vector float *pos_v;  // pointer to array of position vectors
  vector float *vel_v;  // pointer to array of velocity vectors
  float *inv_mass;      // pointer to array of per-particle inverse masses (one float each)
  vector float force_v; // force vector
  float dt;             // time step: added to the simulated time on every iteration
} context;
PPE Makefile:
########################################################################
#                       Subdirectories
########################################################################

# NOTE(review): DIRS, PROGRAM_ppu and IMPORTS are presumably interpreted by
# the make.footer included at the bottom; their exact semantics are defined
# there and are not visible in this file.

# Subdirectory holding the SPE side of the example (see the SPE Makefile).
DIRS		:= spu

########################################################################
#                       Target
########################################################################

# Name of the PPE executable to build.
PROGRAM_ppu		:= euler_spe

########################################################################
#                       Local Defines
########################################################################

# Link inputs: the library named by LIBRARY_embed in the SPE Makefile,
# plus libspe2 and pthreads, which the PPE code uses.
IMPORTS         := spu/lib_particle_spu.a -lspe2 -lpthread

########################################################################
#                       make.footer
########################################################################

# Use the build rules under CELL_TOP when it is defined; otherwise fall back
# to a path relative to this directory.
ifdef CELL_TOP
	include $(CELL_TOP)/buildutils/make.footer
else
	include ../../../../../buildutils/make.footer
endif
PPE Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <libspe2.h>
#include <pthread.h>
#include "particle.h"

/* Handle of the SPE program that main() loads with spe_program_load(). */
extern spe_program_handle_t particle;

/*
 * Particle state kept in main storage. Every array is aligned on a 16-byte
 * boundary; the SPE program reads and writes these arrays by DMA through the
 * pointers stored in the context structure.
 */
vec4D pos[PARTICLES] __attribute__((aligned(16)));      /* positions */
vec4D vel[PARTICLES] __attribute__((aligned(16)));      /* velocities */
float inv_mass[PARTICLES] __attribute__((aligned(16))); /* inverse masses */

/* Force applied to every particle. */
vec4D force __attribute__((aligned(16)));

/* Time step handed to the SPE through the context structure. */
float dt = 1.0f;

/*
 * Everything one PPE thread needs in order to run one SPE context.
 * main() fills this in and hands it to ppu_pthread_function().
 */
struct ppu_pthread_data {
  spe_context_ptr_t spe_ctx;  /* SPE context the thread runs */
  pthread_t pthread;          /* thread handle written by pthread_create() */
  unsigned int entry;         /* entry point passed to spe_context_run() */
  void *argp;                 /* argument pointer passed to spe_context_run() */
};

typedef struct ppu_pthread_data ppu_pthread_data_t;


/*
 * Thread body: runs the SPE context described by arg.
 *
 * arg points to a ppu_pthread_data_t whose spe_ctx, entry and argp members
 * have been initialised by the caller. spe_context_run() is called with those
 * values; if it reports an error, a diagnostic is printed and the whole
 * process exits with status 1. Otherwise the thread terminates through
 * pthread_exit(NULL).
 */
void *ppu_pthread_function(void *arg) {
  ppu_pthread_data_t *datap = (ppu_pthread_data_t *)arg;

  if (spe_context_run(datap->spe_ctx, &datap->entry, 0, 
    datap->argp, NULL, NULL) < 0) {
    /* No trailing newline: perror() appends ": <error text>\n" itself. */
    perror ("Failed running context");
    exit (1);
  }
  pthread_exit(NULL);
}

int main()
{
  ppu_pthread_data_t data;
  parm_context ctx __attribute__ ((aligned (16)));

  ctx.particles = PARTICLES;
  ctx.pos_v = (vector float *)pos;
  ctx.vel_v = (vector float *)vel;
  ctx.force_v = *((vector float *)&force);
  ctx.inv_mass = inv_mass;
  ctx.dt = dt;

  /* Create a SPE context */
  if ((data.spe_ctx = spe_context_create (0, NULL)) == NULL) {
    perror ("Failed creating context");
    exit (1);
  }
  /* Load SPE program into the SPE context*/
  if (spe_program_load (data.spe_ctx, &particle))  {
    perror ("Failed loading program");
    exit (1);
  }
  /* Initialize context run data */
  data.entry = SPE_DEFAULT_ENTRY;
  data.argp = &ctx;
  /* Create pthread for each of the SPE contexts */
  if (pthread_create (&data.pthread, NULL, &ppu_pthread_function, &data)) {
    perror ("Failed creating thread");
    exit (1);
  }
  /* Wait for the threads to complete */
  if (pthread_join (data.pthread, NULL)) {
    perror ("Failed joining thread\n");
    exit (1);
  }
  return (0);
}
SPE Makefile:
########################################################################
#			Target
########################################################################

# NOTE(review): PROGRAM_spu, LIBRARY_embed and INCLUDE are presumably
# interpreted by the make.footer included at the bottom; their exact
# semantics are defined there and are not visible in this file.

# SPE program name; the PPE code refers to it as the external symbol
# "particle". The PPE Makefile links against spu/lib_particle_spu.a.
PROGRAM_spu      := particle
LIBRARY_embed    := lib_particle_spu.a

########################################################################
#			Local Defines
########################################################################

# Add the parent directory to the include path (particle.h is included by
# the SPE code).
INCLUDE			:= -I ..

########################################################################
#			make.footer
########################################################################

# Use the build rules under CELL_TOP when it is defined; otherwise fall back
# to a path relative to this directory.
ifdef CELL_TOP
	include $(CELL_TOP)/buildutils/make.footer
else
	include ../../../../../../buildutils/make.footer
endif
SPE Code:
#include <spu_intrinsics.h>
#include <spu_mfcio.h>
#include "particle.h"

// Number of particles processed per DMA block. 1024 vector floats are
// 16 KB, which is the largest size a single MFC DMA command can move.
#define PARTICLES_PER_BLOCK             1024

// Local store structures and buffers.
// Every object below is the local-store end of a DMA transfer, so each is
// explicitly aligned on a 16-byte boundary (the same alignment the PPE side
// uses for its arrays). The float array in particular has no such guarantee
// from its element type alone.
volatile context ctx __attribute__ ((aligned (16)));
volatile vector float pos[PARTICLES_PER_BLOCK] __attribute__ ((aligned (16)));
volatile vector float vel[PARTICLES_PER_BLOCK] __attribute__ ((aligned (16)));
volatile float inv_mass[PARTICLES_PER_BLOCK] __attribute__ ((aligned (16)));

int main(unsigned long long spe_id, unsigned long long parm)
{
  int i, j;
  int left, cnt;
  float time;
  unsigned int tag_id; 	
  vector float dt_v, dt_inv_mass_v;  

	/* Reserve a tag ID */ 	
  tag_id = mfc_tag_reserve();

  spu_writech(MFC_WrTagMask, -1);

  // Input parameter parm is a pointer to the particle context.
  // Fetch the context, waiting for it to complete.
  spu_mfcdma32((void *)(&ctx), (unsigned int)parm, sizeof(context), 
    tag_id, MFC_GET_CMD);
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

  dt_v = spu_splats(ctx.dt);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += ctx.dt) {
    // For each block of particles
    for (i=0; i<ctx.particles; i+=PARTICLES_PER_BLOCK) {
      // Determine the number of particles in this block.
      left = ctx.particles - i;
      cnt = (left < PARTICLES_PER_BLOCK) ? left : PARTICLES_PER_BLOCK;

      // Fetch the data - position, velocity, inverse_mass. Wait for DMA to  
      // complete before performing computation.
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * 
        sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * 
        sizeof(vector float), tag_id, MFC_GET_CMD);
      spu_mfcdma32((void *)(inv_mass), (unsigned int)(ctx.inv_mass+i), cnt * 
        sizeof(float), tag_id, MFC_GET_CMD);
      (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);

      // Compute the step in time for the block of particles
      for (j=0; j<cnt; j++) {
        pos[j] = spu_madd(vel[j], dt_v, pos[j]);
        dt_inv_mass_v = spu_mul(dt_v, spu_splats(inv_mass[j]));
        vel[j] = spu_madd(dt_inv_mass_v, ctx.force_v, vel[j]);
      }

      // Put the position and velocity data back into main storage
      spu_mfcdma32((void *)(pos), (unsigned int)(ctx.pos_v+i), cnt * 
        sizeof(vector float), tag_id, MFC_PUT_CMD);
      spu_mfcdma32((void *)(vel), (unsigned int)(ctx.vel_v+i), cnt * 
        sizeof(vector float), tag_id, MFC_PUT_CMD);
    }
  }
  // Wait for final DMAs to complete before terminating SPE thread.
  (void)spu_mfcstat(MFC_TAG_UPDATE_ALL);
  return (0);
}