A compiler that automatically merges scalar data into a parallel-packed SIMD data structure is called an auto-vectorizing compiler. Such compilers must handle all the high-level language constructs, and therefore do not always produce optimal code.
/*
 * Scalar version.
 *
 * Multiplies array1 and array2 element by element and stores each product
 * in the matching slot of out.  Nothing is written when arraySize is zero
 * or negative.
 *
 * Always returns 0.
 */
int mult(float *array1, float *array2, float *out, int arraySize)
{
    int i;

    for (i = 0; i < arraySize; i++) {
        out[i] = array1[i] * array2[i];
    }
    return 0;
}

/*
 * Vectorized version.
 *
 * Computes the same element-wise product as mult().  When compiled for the
 * SPU (__SPU__ defined), the arrays are processed four floats at a time with
 * the spu_mul intrinsic; any elements left over because arraySize is not a
 * multiple of 4 are finished by the scalar routine, so every one of the
 * arraySize outputs is written.  On other targets the whole array goes
 * through the scalar routine.
 *
 * The SPU path still assumes that the arrays are quadword-aligned.
 *
 * Always returns 0.
 */
int vmult(float *array1, float *array2, float *out, int arraySize)
{
    int done = 0; /* elements already written by the vector loop */

#ifdef __SPU__
    if (arraySize >= 4) {
        int i;
        int arraySizebyfour = arraySize / 4; /* number of whole vectors */
        vector float *varray1 = (vector float *) (array1);
        vector float *varray2 = (vector float *) (array2);
        vector float *vout = (vector float *) (out);

        for (i = 0; i < arraySizebyfour; i++) {
            /* spu_mul is an intrinsic that multiplies vectors */
            vout[i] = spu_mul(varray1[i], varray2[i]);
        }
        done = arraySizebyfour * 4;
    }
#endif

    /* Scalar clean-up for whatever the vector loop did not cover. */
    if (done < arraySize) {
        mult(array1 + done, array2 + done, out + done, arraySize - done);
    }
    return 0;
}