Finley
Finley

Reputation: 845

Understand efficient implementation of median filter with SSE2 / SSSE3 Instruction Set

I'm recently stuck on understanding some legacy codes about median filter, it is totally implemented by SSE2 Instruction Set. My task is to improve its efficiency based on the legacy codes, which are as follows:

/* sort pixel to acquire median value quickly. */
inline void sortSwap(__m128& a, __m128& b)
{
    __m128 temp = a;
    a = _mm_min_ps(a, b);
    b = _mm_max_ps(temp, b);
}

void medianDispOpt(float* dispImg, float* destDisp, const uint16_t width, const uint16_t height)
{
    float* dispImgTemp = destDisp;

    float* line1 = dispImg;
    float* line2 = dispImg + width;
    float* line3 = dispImg + 2 * width;

    float* end = dispImg + width*height;

    destDisp += width;
    __m128 lastMedian = _mm_setzero_ps();

    do {
        const __m128 line1_reg = _mm_load_ps(line1);
        const __m128 line1_reg_next = _mm_load_ps(line1 + 4);
        __m128 store0 = line1_reg;
        __m128 store1 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line1_reg_next), _mm_castps_si128(line1_reg), 4));
        __m128 store2 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line1_reg_next), _mm_castps_si128(line1_reg), 8));

        const __m128 line2_reg = _mm_load_ps(line2);
        const __m128 line2_reg_next = _mm_load_ps(line2 + 4);
        __m128 store3 = line2_reg;
        __m128 store4 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line2_reg_next), _mm_castps_si128(line2_reg), 4));
        __m128 store5 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line2_reg_next), _mm_castps_si128(line2_reg), 8));

        const __m128 line3_reg = _mm_load_ps(line3);
        const __m128 line3_reg_next = _mm_load_ps(line3 + 4);
        __m128 store6 = line3_reg;
        __m128 store7 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line3_reg_next), _mm_castps_si128(line3_reg), 4));
        __m128 store8 = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(line3_reg_next), _mm_castps_si128(line3_reg), 8));

        // find median
        sortSwap(store1, store2); 
        sortSwap(store4, store5); 
        sortSwap(store7, store8);
        sortSwap(store0, store1); 
        sortSwap(store3, store4); 
        sortSwap(store6, store7);
        sortSwap(store1, store2); 
        sortSwap(store4, store5); 
        sortSwap(store7, store8);
        sortSwap(store0, store3); 
        sortSwap(store5, store8); 
        sortSwap(store4, store7);       
        sortSwap(store3, store6); 
        sortSwap(store1, store4); 
        sortSwap(store2, store5);       
        sortSwap(store4, store7); 
        sortSwap(store4, store2); 
        sortSwap(store6, store4);
        sortSwap(store4, store2);

        const __m128i c = _mm_alignr_epi8(_mm_castps_si128(store4), _mm_castps_si128(lastMedian), 12);
        _mm_store_si128((__m128i*)destDisp, c);
        lastMedian = store4;

        destDisp += 4; line1 += 4; line2 += 4; line3 += 4;

    } while (line3 + 4 + 4 <= end);

    memcpy(dispImgTemp, dispImg, sizeof(float)*(width + 1));
    memcpy(dispImgTemp + width*height - width - 1 - 3, dispImg + width*height - width - 1 - 3, sizeof(float)*(width + 1 + 3));
}

the input image dispImg is of width * height with float pixel, the image data array is arranged by the image row, and each row is compactly laid(i.e. no redundant pixels between rows).
Now, suppose the line1_reg,line1_reg_next,line2_reg,line2_reg_next,line3_reg,line3_reg_next of first three rows are briefly represented by following numbers:

line1_reg= {0 1 2 3}, line1_reg_next = {4 5 6 7}
line2_reg= {8 9 10 11}, line2_reg_next = {12 13 14 15}
line3_reg= {16 17 18 19}, line2_reg_next = {20 21 22 23}

so storens are:

store0 = {0,1,2,3}, store1={1,2,3,4},store2={2,3,4,5}
store3 = {8,9,10,11}, store4={9,10,11,12},store5={10,11,12,13}
store6 = {16,17,18,19}, store7={17,18,19,20},store8={18,19,20,21}

After the first three sortSwaps, changed storens are as follows:

store1 = {min(1,2),min(2,3),min(3,4),min(4,5)}
store2 = {max(1,2),max(2,3),max(3,4),max(4,5)}
store4 = {min(9,10),min(10,11),min(11,12),min(12,13)}
store5 = {max(9,10),max(10,11),max(11,12),max(12,13)}
store7 = {min(17,18),min(18,19),min(19,20),min(20,21)}
store8 = {max(17,18),max(18,19),max(19,20),max(20,21)}

Then after the second three sortSwaps, changed storen are as follows:

store0 = {min(0,1,2),min(1,2,3),min(2,3,4),min(3,4,5)}
store1 = {max(0,1,2),max(1,2,3),max(2,3,4),max(3,4,5)}
store3 = {min(8,9,10),min(9,10,11),min(10,11,12),min(11,12,13)}
store4 = {max(8,9,10),max(9,10,11),max(10,11,12),max(11,12,13)}
store6 = {min(16,17,18),min(17,18,19),min(18,19,20),min(19,20,21)}
store7 = {max(16,17,18),max(17,18,19),max(18,19,20),max(19,20,21)}

Next, after the third three sortSwaps, changed storen are as follows:

store1 = {max(1,2),max(2,3),max(3,4),max(4,5)}
store2 = {max(0,1,2),max(1,2,3),max(2,3,4),max(3,4,5)}
store4 = {max(9,10),max(10,11),max(11,12),max(12,13)}
store5 = {max(8,9,10),max(9,10,11),max(10,11,12),max(11,12,13)}
store7 = {max(17,18),max(18,19),max(19,20),max(20,21)}
store8 = {max(16,17,18),max(17,18,19),max(18,19,20),max(19,20,21)}

... Now start the sixth three sortSwaps, changed storen are as follows:

store4[0] = median(max(1,2),max(9,10),max(17,18))
store4[1] = median(max(2,3),max(10,11),max(18,19))
store4[2] = median(max(3,4),max(11,12),max(19,20))
store4[3] = median(max(4,5),max(12,13),max(20,21))
...

Above is my stiff thread, I cannot continue my inference on calculating the sixth three sortSwaps. So, can anyone give me a clearer interpretation of above codes?

Upvotes: 0

Views: 438

Answers (0)

Related Questions