A compiler that automatically merges scalar data into a parallel-packed SIMD data structure is called an auto-vectorizing compiler. Such compilers must handle all the high-level language constructs, and therefore do not always produce optimal code.
/*
 * Scalar version.
 *
 * Multiplies two float arrays element by element, one element per loop
 * iteration: out[k] = array1[k] * array2[k] for k in [0, arraySize).
 *
 * array1, array2  input operands; only read
 * out             destination; receives arraySize products
 * arraySize       number of elements to process; a value <= 0 processes nothing
 *
 * Always returns 0.
 */
int mult(float *array1, float *array2, float *out, int arraySize) {
    const float *lhs = array1;
    const float *rhs = array2;
    float *dst = out;

    /* Walk the three arrays in lock-step, counting down the work left. */
    for (int remaining = arraySize; remaining > 0; remaining--) {
        const float product = *lhs * *rhs;
        *dst = product;
        lhs++;
        rhs++;
        dst++;
    }
    return 0;
}
/*
 * Vectorized version.
 *
 * Multiplies two float arrays element by element, processing four floats
 * per iteration through the spu_mul vector intrinsic. Any elements left
 * over when arraySize is not a multiple of 4 are finished with a scalar
 * loop, so every one of the arraySize outputs is written.
 *
 * array1, array2  input operands; only read
 * out             destination; receives arraySize products
 * arraySize       number of elements to process; a value <= 0 processes nothing
 *
 * Always returns 0.
 */
int vmult(float *array1, float *array2, float *out, int arraySize) {
    /* This code assumes that the arrays are quadword-aligned. */
    int i, arraySizebyfour;

    /* Nothing to do; this also guarantees the shifts below only ever see
       positive values. */
    if (arraySize <= 0) {
        return 0;
    }

    arraySizebyfour = arraySize >> 2; /* number of whole 4-float vectors */

    /* Reinterpret the float arrays as arrays of 4-float vectors. */
    vector float *varray1 = (vector float *) (array1);
    vector float *varray2 = (vector float *) (array2);
    vector float *vout = (vector float *) (out);

    for (i = 0; i < arraySizebyfour; i++) {
        /* spu_mul is an intrinsic that multiplies vectors */
        vout[i] = spu_mul(varray1[i], varray2[i]);
    }

    /* Scalar tail: the 0-3 elements that do not fill a whole vector. */
    for (i = arraySizebyfour << 2; i < arraySize; i++) {
        out[i] = array1[i] * array2[i];
    }
    return 0;
}