Different result with SSE version

Question

I'm trying to rewrite some code to make use of SSE. However, for some reason my SSE version produces different results than the original, e.g. 209.1 instead of 1.47.

Why? The entire function can be found here.

// Thin value wrapper around one SSE register (`__m128`) holding four packed
// single-precision floats. Every arithmetic operator works lane by lane.
//
// Copy construction and copy assignment are deliberately left to the
// compiler: the only member is a plain register value, so the generated
// member-wise copies are exactly what hand-written ones would do.
struct vec_ps
{
    __m128 value;

    // All four lanes start at zero, so a default-constructed vector never
    // exposes an indeterminate register.
    vec_ps()                    : value(_mm_setzero_ps())   {}

    // Broadcasts `value` into all four lanes. Intentionally implicit so that
    // mixed expressions such as `2.0f * v` or `v + C1` keep compiling.
    vec_ps(float value)         : value(_mm_set1_ps(value)) {}

    // Wraps an already populated register.
    vec_ps(__m128 value)        : value(value)              {}

    // Lane-wise `*this = *this + other`.
    vec_ps& operator+=(const vec_ps& other)
    {
        value = _mm_add_ps(value, other.value);
        return *this;
    }

    // Lane-wise `*this = *this - other`.
    vec_ps& operator-=(const vec_ps& other)
    {
        value = _mm_sub_ps(value, other.value);
        return *this;
    }

    // Lane-wise `*this = *this * other`.
    vec_ps& operator*=(const vec_ps& other)
    {
        value = _mm_mul_ps(value, other.value);
        return *this;
    }

    // Lane-wise `*this = *this / other`.
    vec_ps& operator/=(const vec_ps& other)
    {
        value = _mm_div_ps(value, other.value);
        return *this;
    }

    // Reads four consecutive floats starting at `ptr`.
    // `ptr` is only read, hence `const`; because `_mm_load_ps` is the aligned
    // load, `ptr` must be 16-byte aligned.
    static vec_ps load(const float* ptr)
    {
        return vec_ps(_mm_load_ps(ptr));
    }

    // Writes the four lanes of `other` to `ptr` with a non-temporal store
    // (`_mm_stream_ps`); `ptr` must be 16-byte aligned.
    static void stream(float* ptr, const vec_ps& other)
    {
        _mm_stream_ps(ptr, other.value);
    }

    // Member form of the store above. It does not modify the vector, so it
    // is `const` and usable on read-only objects.
    void stream(float* ptr) const
    {
        _mm_stream_ps(ptr, value);
    }
};

// Lane-wise sum of two vectors, built on top of vec_ps::operator+=.
vec_ps operator+(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps sum(lhs);
    sum += rhs;
    return sum;
}

// Lane-wise difference `lhs - rhs`, built on top of vec_ps::operator-=.
vec_ps operator-(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps difference(lhs);
    difference -= rhs;
    return difference;
}

// Lane-wise product of two vectors, built on top of vec_ps::operator*=.
vec_ps operator*(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps product(lhs);
    product *= rhs;
    return product;
}

// Lane-wise quotient `lhs / rhs`, built on top of vec_ps::operator/=.
vec_ps operator/(const vec_ps& lhs, const vec_ps& rhs)
{
    vec_ps quotient(lhs);
    quotient /= rhs;
    return quotient;
}

// Accumulates the per-element ratio numerator/denominator over five buffers
// twice -- once four lanes at a time through vec_ps (result in ssim_sum2) and
// once with a plain scalar loop (result in ssim_sum) -- and asserts that both
// totals are equal.
// The parameter list and part of the body are elided in this snippet; w, h,
// C1 and C2 are not declared here and must come from the omitted parts.
void foo(/*...*/)
{   
        // NOTE(review): `std::vector>` is not valid C++; the template argument
        // list appears to have been lost from the snippet. The element type is
        // presumably float, since .data() is handed to vec_ps::load below --
        // confirm against the original code.
        std::vector> ref_mu(w*h);
        std::vector> cmp_mu(w*h);
        std::vector> ref_sigma_sqd(w*h);
        std::vector> cmp_sigma_sqd(w*h);
        std::vector> sigma_both(w*h);
        // Size of one buffer in BYTES (w*h floats), not in elements.
        int size    = w*h*sizeof(float);

        /*...*/

        float ssim_sum  = 0.0;
        float ssim_sum2 = 0.0;

        // Four independent partial sums, one per lane, all starting at zero.
        vec_ps ssim_sum_ps(0.0f);       

        // size/16 is the number of complete 16-byte groups, i.e. groups of
        // four floats when sizeof(float) == 4. Integer division rounds down,
        // so if w*h is not a multiple of 4 the last w*h % 4 elements are never
        // visited here, although the scalar loop below does visit them.
        for(int n = 0; n < size/16; ++n)
        {
            // vec_ps::load uses the aligned load _mm_load_ps, so each address
            // must be 16-byte aligned. NOTE(review): whether the buffers
            // guarantee that cannot be seen from this snippet.
            auto ref_mu_ps          = vec_ps::load(ref_mu.data()        + n*4);
            auto cmp_mu_ps          = vec_ps::load(cmp_mu.data()        + n*4);
            auto sigma_both_ps      = vec_ps::load(sigma_both.data()    + n*4);
            auto ref_sigma_sqd_ps   = vec_ps::load(ref_sigma_sqd.data() + n*4);
            auto cmp_sigma_sqd_ps   = vec_ps::load(cmp_sigma_sqd.data() + n*4);

            // Same formula as the scalar loop below; the float operands
            // (2.0f, and C1/C2 if they are floats) are broadcast to all lanes
            // through the implicit vec_ps(float) constructor.
            auto numerator   = (2.0f * ref_mu_ps * cmp_mu_ps + C1) * (2.0f * sigma_both_ps + C2);
            auto denominator = (ref_mu_ps*ref_mu_ps + cmp_mu_ps*cmp_mu_ps + C1) * (ref_sigma_sqd_ps + cmp_sigma_sqd_ps + C2);
            ssim_sum_ps += numerator / denominator; 
        }

        // Horizontal reduction: add the four lane totals into one float.
        // m128_f32 is the per-lane float array that MSVC's __m128 exposes;
        // other compilers do not provide this member.
        for(int n = 0; n < 4; ++n)
            ssim_sum2 += ssim_sum_ps.value.m128_f32[n];

        // Scalar reference: visits all w*h elements in row-major order and
        // adds every term to a single running total.
        for (int y = 0; y < h; ++y)
        {
            int offset = y*w;
            for (int x = 0; x < w; ++x, ++offset) 
            {           
                float numerator   = (2.0f * ref_mu[offset] * cmp_mu[offset] + C1) * (2.0f * sigma_both[offset] + C2);
                float denominator = (ref_mu[offset]*ref_mu[offset] + cmp_mu[offset]*cmp_mu[offset] + C1) * (ref_sigma_sqd[offset] + cmp_sigma_sqd[offset] + C2);
                ssim_sum += numerator / denominator;                
            }
        }
        // Exact floating-point equality. The two totals are built in a
        // different order (four interleaved partial sums vs. one sequential
        // sum), so their rounding can differ even for identical inputs; any
        // tail skipped by the SSE loop above widens the gap further.
        assert(ssim_sum2 == ssim_sum); // FAILS!
}

Dietmar Kühl · Accepted Answer

Repeating my comment from above, as it seems to be the answer to the question: Is there any guarantee that w * h is divisible by four? If that isn't the case, your last iteration in the SSE version will be based on random numbers. The use of sizeof(float) in one place and 16 instead of 4 * sizeof(float) in another is somewhat confusing: why not leave off the size of the float? Also, why doesn't the non-SSE version just run over the area instead of trying to follow the width and the height of the matrix?

Different result with SSE version

Answers (1)

Related Questions