How does the CUDA box filter work?

Question

I have this code sample that I am trying to understand:

    // Horizontal box-filter pass. Each thread owns one image row `y` and
    // writes w packed pixels to od[y * w] .. od[y * w + w - 1].
    //
    //   od : output image, addressed as od[y * w + x]
    //   w  : number of output pixels per row
    //   h  : number of rows; threads with y >= h do nothing
    //   r  : filter radius, the window spans 2r + 1 texels
    //
    // The sum of the window centred on x = 0 is built once. For every further
    // x the window slides one texel: texel (x + r) is added and texel
    // (x - r - 1) is removed, so each output costs two fetches.
    //
    // NOTE(review): the fetches use x coordinates below 0 and above w - 1.
    // How those resolve depends on the addressing mode of rgbaTex, which is
    // configured outside this kernel -- presumably clamp-to-edge, confirm.
    __global__ void
    d_boxfilter_rgba_x(unsigned int *od, int w, int h, int r)
    {
        const unsigned int y = blockIdx.x * blockDim.x + threadIdx.x;

        // Threads beyond the last row have nothing to compute.
        if (!(y < h))
        {
            return;
        }

        // Multiplying by the reciprocal of the window size yields the average.
        const float invWindow = 1.0f / (float)(2 * r + 1);

        // Offset of this row's first pixel in the output image.
        const unsigned int rowStart = y * w;

        // Sum of the initial window, texels -r .. +r.
        float4 windowSum = make_float4(0.0f);
        for (int tap = -r; tap <= r; ++tap)
        {
            windowSum += tex2D(rgbaTex, tap, y);
        }
        od[rowStart] = rgbaFloatToInt(windowSum * invWindow);

        // Slide the window across the rest of the row.
        for (int col = 1; col < w; ++col)
        {
            windowSum += tex2D(rgbaTex, col + r, y);     // texel entering on the right
            windowSum -= tex2D(rgbaTex, col - r - 1, y); // texel leaving on the left
            od[rowStart + col] = rgbaFloatToInt(windowSum * invWindow);
        }
    }

// Vertical box-filter pass. Each thread owns one image column `x` and walks
// it from row 0 to row h - 1 with a sliding window of 2r + 1 pixels.
//
//   id : input image, read as id[x + y * w]
//   od : output image, written as od[x + y * w]
//   w  : row stride in pixels; threads with x >= w do nothing
//   h  : number of rows
//   r  : filter radius
//
// Rows outside the image are replaced by the nearest edge row: row 0 is
// counted r extra times at the start, and row h - 1 is re-added at the end.
//
// Precondition: h >= 2 * r + 1. The start-up phase reads rows up to 2r and
// the final phase subtracts row (h - 2r - 1), so a shorter image would be
// indexed out of range.
__global__ void
d_boxfilter_rgba_y(unsigned int *id, unsigned int *od, int w, int h, int r)
{
    unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;

    // Guard against grids rounded up past the image width. Without it the
    // surplus threads would alias pixels of the following row and run past
    // the end of the buffers on the last row.
    if (x >= (unsigned int)w)
    {
        return;
    }

    // Rebase both pointers to the top of this thread's column, so that
    // id[y * w] / od[y * w] address row y of the column.
    id = &id[x];
    od = &od[x];

    // Multiplying by the reciprocal of the window size yields the average.
    float scale = 1.0f / (float)((r << 1) + 1);

    float4 t;
    // leading edge: the r rows above the image are stood in for by row 0
    t = rgbaIntToFloat(id[0]) * r;

    // add rows 0 .. r to complete the window centred on row 0
    for (int y = 0; y < (r + 1); y++)
    {
        t += rgbaIntToFloat(id[y*w]);
    }

    od[0] = rgbaFloatToInt(t * scale);

    // rows 1 .. r: the row leaving the window is still above the image,
    // so one copy of row 0 is removed each step
    for (int y = 1; y < (r + 1); y++)
    {
        t += rgbaIntToFloat(id[(y + r) * w]);
        t -= rgbaIntToFloat(id[0]);
        od[y * w] = rgbaFloatToInt(t * scale);
    }

    // main loop: the whole window lies inside the image; add row (y + r)
    // and drop row (y - r - 1)
    for (int y = (r + 1); y < (h - r); y++)
    {
        t += rgbaIntToFloat(id[(y + r) * w]);
        t -= rgbaIntToFloat(id[((y - r) * w) - w]);
        od[y * w] = rgbaFloatToInt(t * scale);
    }

    // trailing edge: the row entering the window is below the image,
    // so the last row (h - 1) is added in its place
    for (int y = h - r; y < h; y++)
    {
        t += rgbaIntToFloat(id[(h - 1) * w]);
        t -= rgbaIntToFloat(id[((y - r) * w) - w]);
        od[y * w] = rgbaFloatToInt(t * scale);
    }
}

This is supposed to be a box filter implemented in CUDA. From what I have read, a box filter averages the pixels within a given radius. But d_boxfilter_rgba_y does something like this:

od[0] = rgbaFloatToInt(t * scale);

I don't understand why this scale is used, or why there are so many loops when there should be just one: sum the values from -r to +r and divide the result by the number of pixels.

Can somebody help me?

OutOfBound · Accepted Answer

To calculate the average of a box with radius 1 (3 values), you do:

(box[0] + box[1] + box[2]) / 3 // which is equal to
(box[0] + box[1] + box[2]) * (1/3) // where 1/3 is your scale factor

The calculation of scale is:

1.0f / (float)((r << 1) + 1); // equal to
1 / ((r * 2) + 1) // equal to
1 / (2r + 1) // 2r because you go to the left and right and +1 for the middle

Two for loops are used because the code applies the "sliding window" optimisation. First, the sum of the first box is calculated:

for (int x = -r; x <= r; x++)
{
    t += tex2D(rgbaTex, x, y);
}

Then, for each step to the right, the value just to the right of the box is added and the leftmost value of the box is removed. That way you can update the sum of the box with just 2 operations instead of recomputing it with 2*r + 1 operations.

for (int x = 1; x < w; x++)
{
    t += tex2D(rgbaTex, x + r, y);
    t -= tex2D(rgbaTex, x - r - 1, y);
    od[y * w + x] = rgbaFloatToInt(t * scale);
}

How does the CUDA box filter work?

Answers (1)

Related Questions