Reputation: 5232
I am currently playing with ARM NEON and have written the following functions, one in plain C and one with NEON intrinsics, to compare their speeds. The functions compare two arrays. The parameter cb
is the number of bytes divided by 8:
inline uint32_t is_not_zero(uint32x4_t v)
{
    uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
    return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}
uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint32_t *s1_cmp = (const uint32_t *)s1;
    const uint32_t *s2_cmp = (const uint32_t *)s2;
    cb *= 2;
    while (cb--)
    {
        uint32x4x2_t cmp1 = vld2q_u32(s1_cmp);
        uint32x4x2_t cmp2 = vld2q_u32(s2_cmp);
        uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]);
        uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]);
        if (!is_not_zero(res1)) return 1;
        if (!is_not_zero(res2)) return 1;
        s1_cmp += 8;
        s2_cmp += 8;
    }
    return 0;
}
uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint64_t *p1 = (const uint64_t *)s1;
    const uint64_t *p2 = (const uint64_t *)s2;
    uint32_t n = 0;
    while (cb--) {
        if ((p1[n  ] != p2[n  ]) ||
            (p1[n+1] != p2[n+1]) ||
            (p1[n+2] != p2[n+2]) ||
            (p1[n+3] != p2[n+3])) return 1;
        ++n;
    }
    return 0;
}
I don't understand why the C implementation is WAY faster than the NEON variant. The code is compiled on a Raspberry Pi with
-O3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard
as CFLAGS.
Upvotes: 0
Views: 400
Reputation: 21
Bugs notwithstanding (see below), the answer to this sort of question is ultimately hardware dependent, and depends on whether you are using arm32 NEON or arm64 NEON, since the ISAs are different. I'm going to assume we are looking at arm32 here, because you did not use the 4-lane arm64 horizontal max instruction (vmaxvq_u32), which would have saved some operations converting a uint32x4_t to a bool/uint32_t.

With arm32, we are at risk of running on in-order processors, so pipeline latency might be a big factor. It could be up to 8 cycles per instruction, and you've only unrolled by 2. Also, some designs have the vector unit running behind the scalar units by 10 cycles or so, so every time you move data from the vector unit to the scalar unit, as you do in "if (!is_not_zero(res1))...", you'll take a 10-cycle stall, or whatever the delay is. That can be a killer.

Finding out what is going on is best done with a sampler: look at where the samples land in the assembly and interpret the tea leaves. Finding a sampler that will show you assembly might be its own challenge.
Ultimately, whether you are on 32-bit or 64-bit ARM, reducing a SIMD register down to something that will fit in a condition register is expensive. ARM doesn't have instructions like Intel's PTEST or AltiVec's dot-form compares that move a result directly into the condition register. Even if there isn't a delay between the scalar unit and the vector unit, the N-instruction reduction sequence is going to kill you; you just can't do this reduction that often. You could instead OR a bunch of vectors together and only infrequently check whether any lane in the "sum" is non-zero. So, for example, the very first thing to try would be to replace:
    if (!is_not_zero(res1)) return 1;
    if (!is_not_zero(res2)) return 1;
with
    if (!is_not_zero(res1 | res2)) return 1;
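The effect of this transformation is easier to see in portable scalar C. The sketch below (illustrative names, not the OP's code) applies the same idea: OR the per-element differences into an accumulator and branch on the reduction only once per block. Note that it returns 1 when the buffers differ, which appears to be the intended semantics; a NEON version would fold the comparison masks with vorrq_u32 the same way.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Deferred-check sketch: accumulate per-word differences with OR and
 * test the accumulator once per 4-word block instead of branching on
 * every comparison. Returns 1 if the buffers differ, 0 otherwise. */
static int differs(const uint64_t *p1, const uint64_t *p2, size_t n_words)
{
    size_t i = 0;
    while (i + 4 <= n_words) {
        uint64_t acc = 0;
        /* XOR is non-zero exactly where the words differ; OR it all in. */
        acc |= p1[i]     ^ p2[i];
        acc |= p1[i + 1] ^ p2[i + 1];
        acc |= p1[i + 2] ^ p2[i + 2];
        acc |= p1[i + 3] ^ p2[i + 3];
        /* One branch per block, not one per word. */
        if (acc) return 1;
        i += 4;
    }
    /* Tail loop for lengths that are not a multiple of 4 words. */
    for (; i < n_words; ++i)
        if (p1[i] != p2[i]) return 1;
    return 0;
}
```

The same trade-off applies on the vector side: the fewer times you cross from the vector unit to a scalar branch, the less the reduction cost matters.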
Bug?: Also, I think your vector line cb *= 2 is wrong and should probably be cb /= 4 to correct for the difference in size between uint64_t and uint32x4x2_t -- each NEON iteration consumes 32 bytes, not 8. Assuming you don't crash, this error alone would inflate your NEON times by roughly 8x. On the other hand, I feel that the ++n in the scalar code is similarly in error -- should it be n += 4? -- so perhaps I don't fully understand what you are trying to accomplish. There seems to be some redundant work going on here.
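For what it's worth, under those assumptions (cb counts uint64_t words as stated in the question, n advances by 4, and the function should return 1 on any mismatch), a corrected scalar loop might look like the sketch below; sum_c_fixed is a hypothetical name, not the OP's code.

```c
#include <assert.h>
#include <stdint.h>

/* Corrected variant of sum_c under the assumption that cb is the number
 * of uint64_t words in each buffer. Each iteration consumes four words,
 * so n advances by 4 (the suspected ++n bug) and the loop bound keeps
 * all reads in range. Any trailing cb % 4 words are not compared here. */
uint32_t sum_c_fixed(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint64_t *p1 = (const uint64_t *)s1;
    const uint64_t *p2 = (const uint64_t *)s2;
    for (uint32_t n = 0; n + 4 <= cb; n += 4) {
        if ((p1[n]     != p2[n])     ||
            (p1[n + 1] != p2[n + 1]) ||
            (p1[n + 2] != p2[n + 2]) ||
            (p1[n + 3] != p2[n + 3]))
            return 1;
    }
    return 0;
}
```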
Upvotes: 0