Example: array-summing

This example illustrates array summing with a function that adds up the 16 one-byte values of an input array.

The following code contains three versions of a function that sums an input array of 16 one-byte values. For this kind of array-summing function, you have several options:
The first option performs 16 iterations of the loop. The second option performs only four iterations of the loop but with four additions in each iteration. The third option uses Vector/SIMD Multimedia Extension intrinsics to eliminate the loop entirely.
// Rolled version: 16 passes through the loop, one addition per pass.
// bytes points at the 16 values to add; the return value is their sum.
int rolled_sum(unsigned char bytes[16])
{
	int total = 0;
	int remaining = 16;
	const unsigned char *cursor = bytes;

	// Walk the array front to back, consuming one byte per pass.
	while (remaining-- > 0) {
		total += *cursor++;
	}
	return total;
}


// Unrolled version: 4 passes through the loop, 4 additions per pass.
// Each pass adds one byte to each of four independent partial sums;
// the partial sums are combined only once, after the loop.
int unrolled_sum(unsigned char bytes[16])
{
	int lane0 = 0;
	int lane1 = 0;
	int lane2 = 0;
	int lane3 = 0;
	const unsigned char *quad;

	// Step through the array four bytes at a time.
	for (quad = bytes; quad != bytes + 16; quad += 4) {
		lane0 += quad[0];
		lane1 += quad[1];
		lane2 += quad[2];
		lane3 += quad[3];
	}
	return lane0 + lane1 + lane2 + lane3;
}

// Vectorized for Vector/SIMD Multimedia Extension: sums the 16 bytes
// with vector intrinsics instead of a loop.
//
// bytes    Points at the 16 values to add. It does not have to be
//          16-byte aligned; the load below handles the misaligned case.
// returns  The sum of the 16 bytes.
int vectorized_sum(unsigned char bytes[16])
{
  vector unsigned char vbytes;
  // The union gives scalar access to the words of the vector result.
  union {
    int i[4];
    vector signed int v;
  } sum;
  vector unsigned int zero = (vector unsigned int){0};

  // Perform a misaligned vector load of the 16 bytes.
  // vec_ld ignores the low four bits of the effective address, so the two
  // loads fetch the aligned quadwords holding bytes[0] and bytes[15], and
  // vec_perm (steered by vec_lvsl) picks the wanted 16 bytes out of the pair.
  // The second load uses offset 15 rather than 16: when bytes is already
  // aligned, offset 16 would read the quadword that lies entirely past the
  // array, which may be outside the caller's storage.
  vbytes = vec_perm(vec_ld(0, bytes), vec_ld(15, bytes), vec_lvsl(0, bytes));

  // Sum the 16 bytes of the vector: vec_sum4s adds each group of four bytes
  // into one word, then vec_sums adds those four words into element 3.
  sum.v = vec_sums((vector signed int)vec_sum4s(vbytes, zero),
    (vector signed int)zero);

  // Extract the sum and return the result.
  return (sum.i[3]);
}