Simple SSE loop slower than non-SSE version

Question

I am trying to compare SSE float[4] addition with standard float[4] addition. As a demo, I repeatedly add two 4-component vectors and print the sum of all the resulting components, with and without SSE:

#include <iostream>
#include 

// Four packed single-precision components; a default-constructed Point4
// has every component equal to zero.
struct Point4
{
  // Value-initialising the array member zeroes all four floats.
  Point4() : data() {}

  float data[4];
};

// Scalar baseline for the benchmark.
//
// Adds the fixed vectors a = (1, 2, 3, 4) and b = (1, 6, 3, 5) component by
// component, `iterations` times, folding every component sum into a single
// float accumulator, then prints "total: <value>" to std::cout.
//
// iterations: number of outer-loop passes. The default matches the original
//             hard-coded count of one billion. It is an integer on purpose:
//             the previous bound `1e9` is a double literal, which forced an
//             unsigned->double conversion and a floating-point compare on
//             every pass through the loop being timed.
//
// NOTE(review): each pass adds 2 + 8 + 6 + 9 = 25 to a float. With the
// default count the exact sum is far beyond the range in which float can
// represent every integer, so the printed value is a rounded result and
// should not be expected to match Vectorized(), which accumulates per lane
// in a different order. Confirm before using the output as a correctness
// check.
void Standard(unsigned int iterations = 1000000000u)
{
  Point4 a;
  a.data[0] = 1.0f;
  a.data[1] = 2.0f;
  a.data[2] = 3.0f;
  a.data[3] = 4.0f;

  Point4 b;
  b.data[0] = 1.0f;
  b.data[1] = 6.0f;
  b.data[2] = 3.0f;
  b.data[3] = 5.0f;

  float total = 0.0f;
  for(unsigned int i = 0; i < iterations; ++i)
  {
    // Component order is kept as 0..3 so the float rounding sequence is
    // exactly the one the original loop produced.
    for(unsigned int component = 0; component < 4; ++component)
    {
      total += a.data[component] + b.data[component];
    }
  }

  std::cout << "total: " << total << std::endl;
}

// SIMD variant of the benchmark, written with the GCC/Clang vector extension.
//
// Adds the fixed vectors a = (1, 2, 3, 4) and b = (1, 6, 3, 5) into a
// four-lane float accumulator `iterations` times, then sums the four lanes
// and prints "total: <value>" to std::cout.
//
// iterations: number of loop passes. The default matches the original
//             hard-coded count of one billion. It is an integer on purpose:
//             the previous bound `1e9` is a double literal, which forced an
//             unsigned->double conversion and a floating-point compare on
//             every pass through the loop being timed.
//
// The vectors are initialised with brace lists and read back with the
// extension's subscript operator instead of through `(float*)&vec` casts,
// so the code no longer takes the address of the accumulator or relies on
// pointer punning to reach individual lanes.
//
// NOTE(review): every lane is a float accumulator. With the default count
// the exact per-lane sums are far beyond the range in which float can
// represent every integer, so the printed value is a rounded result and
// should not be expected to match Standard(), which accumulates all
// components into one float. Confirm before using the output as a
// correctness check.
void Vectorized(unsigned int iterations = 1000000000u)
{
  typedef float v4sf __attribute__ (( vector_size(4*sizeof(float)) ));

  const v4sf a = { 1.0f, 2.0f, 3.0f, 4.0f };
  const v4sf b = { 1.0f, 6.0f, 3.0f, 5.0f };
  v4sf result = { 0.0f, 0.0f, 0.0f, 0.0f };

  for(unsigned int i = 0; i < iterations; ++i)
  {
    result += a + b; // Vectorized operation: all four lanes at once
  }

  // Sum the components of the result (Standard() does this with its
  // "total += " statement inside the loop).
  float total = 0.0f;
  for(unsigned int component = 0; component < 4; ++component)
  {
    total += result[component];
  }
  std::cout << "total: " << total << std::endl;
}

// Benchmark driver: runs one variant per build.
int main()
{
  // To time the scalar loop instead, enable the call below and comment out
  // the call to Vectorized().
  // Standard();
  Vectorized();
}

However, the standard method (~0.2 seconds) seems to be faster than the vectorized method (~0.4 seconds). Is that because of the final for loop that sums the v4sf components? Is there a better operation I can use to time the difference between these two techniques while still comparing the outputs to make sure the two versions produce the same result?

Simple SSE loop slower than non-SSE version

Answers (1)

Related Questions