Makogan
Makogan

Reputation: 9540

Optimizing a raytracing shader in GLSL

I have coded a voxelization based raytracer which is working as expected but is very slow.

Currently the raytracer code is as follows:

#version 430 
//normalized positon from (-1, -1) to (1, 1)
in vec2 f_coord;

out vec4 fragment_color;

struct Voxel
{
    vec4 position;
    vec4 normal;
    vec4 color;
};

struct Node
{
    //children of the current node
    int children[8];
};

layout(std430, binding = 0) buffer voxel_buffer
{
    //last layer of the tree, the leafs
    Voxel voxels[];
};
layout(std430, binding = 1) buffer buffer_index
{
    uint index;
};
layout(std430, binding = 2) buffer tree_buffer
{
    //tree structure       
    Node tree[];
};
layout(std430, binding = 3) buffer tree_index
{
    uint t_index;
};

uniform vec3 camera_pos; //position of the camera
uniform float aspect_ratio; // aspect ratio of the window
uniform float cube_dim; //Dimenions of the voxelization cube
uniform int voxel_resolution; //Side length of the cube in voxels

#define EPSILON 0.01
// Detect whether a position is inside of the voxel with size size located at corner
bool inBoxBounds(vec3 corner, float size, vec3 position)
{
    bool inside = true;
    position-=corner;//coordinate of the position relative to the box coordinate system
    //Test that all coordinates are inside the box, if any is outisde, the point is out the box
    for(int i=0; i<3; i++)
    {
        inside = inside && (position[i] > -EPSILON);
        inside = inside && (position[i] < size+EPSILON);
    }

    return inside;
}

//Get the distance to a box or infinity if the box cannot be hit
float boxIntersection(vec3 origin, vec3 dir, vec3 corner0, float size)
{
    dir = normalize(dir);
    vec3 corner1 = corner0 + vec3(size,size,size);//Oposite corner of the box

    float coeffs[6];
    //Calculate the intersaction coefficients with te 6 bonding planes 
    coeffs[0] = (corner0.x - origin.x)/(dir.x);
    coeffs[1] = (corner0.y - origin.y)/(dir.y);
    coeffs[2] = (corner0.z - origin.z)/(dir.z);

    coeffs[3] = (corner1.x - origin.x)/(dir.x);
    coeffs[4] = (corner1.y - origin.y)/(dir.y);
    coeffs[5] = (corner1.z - origin.z)/(dir.z);
    //by default the distance to the box is infinity
    float t = 1.f/0.f;

    for(uint i=0; i<6; i++){
        //if the distance to a boxis negative, we set it to infinity as we cannot travel in the negative direction
        coeffs[i] = coeffs[i] < 0 ? 1.f/0.f : coeffs[i];
        //The distance is the minumum of the previous calculated distance and the current distance
        t = inBoxBounds(corner0,size,origin+dir*coeffs[i]) ? min(coeffs[i],t) : t;
    }

    return t;
}

#define MAX_TREE_HEIGHT 11
int nodes[MAX_TREE_HEIGHT];
int levels[MAX_TREE_HEIGHT];
vec3 positions[MAX_TREE_HEIGHT];
int sp=0;

void push(int node, int level, vec3 corner)
{
    nodes[sp] = node;
    levels[sp] = level;
    positions[sp] = corner;
    sp++;
}

void main()
{   
    int count = 0; //count the iterations of the algorithm
    vec3 r = vec3(f_coord.x, f_coord.y, 1.f/tan(radians(40))); //direction of the ray
    r.y/=aspect_ratio; //modify the direction based on the windows aspect ratio
    vec3 dir = r;
    r += vec3(0,0,-1.f/tan(radians(40))) + camera_pos; //put the ray at the camera position

    fragment_color = vec4(0);
    int max_level = int(log2(voxel_resolution));//height of the tree
    push(0,0,vec3(-cube_dim));//set the stack
    float tc = 1.f; //initial color value, to be decreased whenever a voxel is hit
    //tree variables
    int level=0;
    int node=0;
    vec3 corner;

    do
    {
        //pop from stack
        sp--;
        node = nodes[sp];
        level = levels[sp];
        corner = positions[sp];

        //set the size of the current voxel 
        float size = cube_dim / pow(2,level);
        //set the corners of the children
        vec3 corners[] =
            {corner,                        corner+vec3(0,0,size),
            corner+vec3(0, size,0),         corner+vec3(0,size,size),
            corner+vec3(size,0,0),          corner+vec3(size,0,size),
            corner+vec3(size,size,0),       corner+vec3(size,size,size)};

        float coeffs[8];
        for(int child=0; child<8; child++)
        {
            //Test non zero childs, zero childs are empty and thus should be discarded
            coeffs[child] = tree[node].children[child]>0?
                //Get the distance to your child if it's not empty or infinity if it's empty
                boxIntersection(r, dir, corners[child], size) : 1.f/0.f;
        }
        int indices[8] = {0,1,2,3,4,5,6,7};
        //sort the children from closest to farthest
        for(uint i=0; i<8; i++)
        {
            for(uint j=i; j<8; j++)
            {
                if((coeffs[j] < coeffs[i]))
                {
                    float swap = coeffs[i];
                    coeffs[i] = coeffs[j];
                    coeffs[j] = swap;

                    int iSwap = indices[i];
                    indices[i] = indices[j];
                    indices[j] = iSwap;

                    vec3 vSwap = corners[i];
                    corners[i] = corners[j];
                    corners[j] = vSwap;
                }
            }
        }
        //push to stack
        for(uint i=7; i>=0; i--)
        {
            if(!isinf(coeffs[i]))
            {
                push(tree[node].children[indices[i]],
                    level+1, corners[i]);
            }
        }
        count++;
    }while(level < (max_level-1) && sp>0);
    //set color
    fragment_color = vec4(count)/100;
}

As it may not be fully clear what this does, let me explain.

We check ray-box intersections starting with a big cube. If we hit it we test intersection with the 8 cubes that compose it.

If we hit any fo those we check intersections with the 8 cubes that make up that cube.

In 2D this would look as follows:

enter image description here

In this case we have 4 layers, we check the big box first, then the ones colored in red, then the ones colored in green, and finally the ones colored in blue.

Printing out the number of times the raytracing step executed as a color (which is what the code snippet I have provided does)

results in the following image:

enter image description here

As you can see, most of the time the shader doesn't do more than 100 iterations.

However this shader takes 200 000 microseconds to execute on average in a gtx 1070.

Since the issue is not number of executions, my problem is likely to be on thread execution.

Does anyone know how I could optimize this code? The biggest botttleneck seems to be the use of a stack.

If I run the same code without pushing to the stack (generating wrong output), there is a 10 fold improvement in runtime

Upvotes: 5

Views: 2016

Answers (3)

Darklighter
Darklighter

Reputation: 2192

Thread execution on a GPU may be massively parallel, but that doesn’t mean that all threads run independently from one another. Groups of threads execute exactly the same instructions, the only difference is the input data. That means that branches and therefore loops can’t make a thread diverge in execution and therefore also not let them terminate early.

Your example shows the most extreme edge case of this: when there is a high likelyhood that in a group of threads all work that’s done is relevant to one thread only.

To alleviate this, you should try to reduce the difference in execution length (iterations in your case) for threads in a group (or in total). This can be done by setting a limit on the number of iterations per shader pass and rescheduling only those threads/pixels that need more iterations.

Upvotes: 1

bernie
bernie

Reputation: 10380

The first thing that stands out is your box intersection function. Have a look at inigo quilez' procedural box function for a much faster version. Since your boxsize is uniform in all axes and you don't need outNormal, you can get an even lighter version. In essence, use maths instead of the brute force approach that tests each box plane.

Also, try to avoid temporary storage where possible. For example, the corners array could be computed on demand for each octree box. Of course, with the above suggestion, these will be changed to box centers.

Since nodes, levels and positions are always accessed together, try co-locating them in a new single struct and access them as a single unit.

Will look more later...

Upvotes: 3

Ripi2
Ripi2

Reputation: 7198

It seems you test for intersection with the ray most of all voxels in each level of the octree. And sort them (by some distance) also in each level. I propose another approach.

If the ray intersects with the bounding box (level 0 of the octree) it makes it at two faces of the box. Or in a corner or an edge, these are "corner" cases.

Finding the 3D ray-plane intersection can be done like here. Finding if the intersection is inside the face (quad) can be done by testing if the point is inside of one of the two triangles of the face, like here.

Get the farthest intersection I0 from the camera. Also let r be a unit vector of the ray in the direction I0 toward the camera.

Find the deepest voxel for I0 coordinates. This is the farthest voxel from the camera.

Now we want the exit-coordinates I0e for the ray in that voxel, through another face. While you could do again the calculations for all 6 faces, if your voxels are X,Y,X aligned and you define the ray in the same coordinates system as the octree, then calculae simplify a lot.

Apply a little displacement (e.g. a 1/1000 of the smallest voxel size) to I0e by the r unit vector of the ray: I1 = I0e + r/1000. Find the voxel to these I1. This is the next voxel in the sorted list of voxel-ray intersections.

Repeat finding I1e then I2 then I2e then I3 etc. until the bounding box is exited. The list of crossed voxels is sorted.

Working with the octree can be optimized depending on how you store its info: All possible nodes or just used. Nodes with data or just "pointers" to another container with the data. This is matter for another question.

Upvotes: 2

Related Questions