Step 1: SIMDize the code for execution on the PPE

There are multiple strategies for SIMDizing code for execution either on the PPE's VXU or on an SPE's SPU unit. The technique chosen depends upon the type of data being operated on and the interdependencies of the data computations.

There are several strategies to consider:

Assuming the compiler auto-SIMDization is either unavailable or ineffective, you must adjust the data structures for efficient SIMD access. This decision cannot be made without also considering the SPE data-accessing method and the data-parallelization method. In addition, data should be aligned or padded for efficient quadword accesses, using the aligned attribute.

Step 1a: SIMDize in Array-of-Structures Form for Vector/SIMD Multimedia Extension

The following example shows how to SIMDize in the AOS form. Vector/SIMD Multimedia Extension intrinsics are used, and they can be identified by their prefix vec_. The algorithm assumes that the number of particles is a multiple of four. Special code must be included to handle the last number of particles that is not a multiple of four.
#define END_OF_TIME     10
#define PARTICLES       100000

typedef struct {
  float x, y, z, w;  
} vec4D;
vec4D pos[PARTICLES] __attribute__ ((aligned (16)));
vec4D vel[PARTICLES] __attribute__ ((aligned (16)));
vec4D force __attribute__ ((aligned (16)));
float inv_mass[PARTICLES] __attribute__ ((aligned (16)));
float dt __attribute__ ((aligned (16))) = 1.0f;

int main()
{
  int i;
  float time;
  float dt_inv_mass __attribute__ ((aligned (16)));
  vector float dt_v, dt_inv_mass_v;
  vector float *pos_v, *vel_v, force_v;
  vector float zero = (vector float){0.0f, 0.0f, 0.0f, 0.0f};

  pos_v = (vector float *)pos;
  vel_v = (vector float *)vel;
  force_v = *((vector float *)&force);

  // Replicate the variable time step across elements 0-2 of
  // a floating point vector. Force the last element (3) to zero.
  dt_v = vec_sld(vec_splat(vec_lde(0, &dt), 0), zero, 4);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each particle
    for (i=0; i<PARTICLES; i++) {
      // Compute the new position and velocity as acted upon by the force f.
      pos_v[i] = vec_madd(vel_v[i], dt_v, pos_v[i]);

      dt_inv_mass = dt * inv_mass[i];
      dt_inv_mass_v = vec_splat(vec_lde(0, &dt_inv_mass), 0);

      vel_v[i] = vec_madd(dt_inv_mass_v, force_v, vel_v[i]);
    }
  }
  return (0);
}

Step 1b: : SIMDize in Structure-of-Arrays Form for Vector/SIMD Multimedia Extension

The following example shows how to SIMDize in the SOA form. As in Step 1a, the algorithm assumes that the number of particles is a multiple of 4.
#define END_OF_TIME     10
#define PARTICLES       100000

typedef struct {
  float x, y, z, w;  
} vec4D;

// Separate arrays for each component of the vector.
vector float pos_x[PARTICLES/4], pos_y[PARTICLES/4], pos_z[PARTICLES/4];
vector float vel_x[PARTICLES/4], vel_y[PARTICLES/4], vel_z[PARTICLES/4];
vec4D force __attribute__ ((aligned (16)));
float inv_mass[PARTICLES] __attribute__ ((aligned (16)));
float dt = 1.0f;

int main()
{
  int i;
  float time;
  float dt_inv_mass __attribute__ ((aligned (16)));
  vector float force_v, force_x, force_y, force_z;
  vector float dt_v, dt_inv_mass_v;

  // Create a replicated vector for each component of the force vector.
  force_v = *(vector float *)(&force);
  force_x = vec_splat(force_v, 0);
  force_y = vec_splat(force_v, 1);
  force_z = vec_splat(force_v, 2);

  // Replicate the variable time step across all elements.
  dt_v = vec_splat(vec_lde(0, &dt), 0);

  // For each step in time
  for (time=0; time<END_OF_TIME; time += dt) {
    // For each particle
    for (i=0; i<PARTICLES/4; i++) {
      // Compute the new position and velocity as acted upon by the force f.
      pos_x[i] = vec_madd(vel_x[i], dt_v, pos_x[i]);
      pos_y[i] = vec_madd(vel_y[i], dt_v, pos_y[i]);
      pos_z[i] = vec_madd(vel_z[i], dt_v, pos_z[i]);

      dt_inv_mass = dt * inv_mass[i];
      dt_inv_mass_v = vec_splat(vec_lde(0, &dt_inv_mass), 0);

      vel_x[i] = vec_madd(dt_inv_mass_v, force_x, vel_x[i]);
      vel_y[i] = vec_madd(dt_inv_mass_v, force_y, vel_y[i]);
      vel_z[i] = vec_madd(dt_inv_mass_v, force_z, vel_z[i]);
    }
  }
  return (0);
}