Problem with making radix sort __uint128_t array

Question

I am trying to convert this fast radix sort code to work with __uint128_t's.

typedef union {
    struct {
        uint32_t c8[256];
        uint32_t c7[256];
        uint32_t c6[256];
        uint32_t c5[256];
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 8];
} rscounts_t;

uint64_t * radixSort(uint64_t * array, uint32_t size) {
    rscounts_t counts;
    memset(&counts, 0, 256 * 8 * sizeof(uint32_t));
    uint64_t * cpy = (uint64_t *)malloc(size * sizeof(uint64_t));
    uint32_t o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
    uint32_t t8, t7, t6, t5, t4, t3, t2, t1;
    uint32_t x;
    // calculate counts
    for(x = 0; x < size; x++) {
        t8 = array[x] & 0xff;
        t7 = (array[x] >> 8) & 0xff;
        t6 = (array[x] >> 16) & 0xff;
        t5 = (array[x] >> 24) & 0xff;
        t4 = (array[x] >> 32) & 0xff;
        t3 = (array[x] >> 40) & 0xff;
        t2 = (array[x] >> 48) & 0xff;
        t1 = (array[x] >> 56) & 0xff;
        counts.c8[t8]++;
        counts.c7[t7]++;
        counts.c6[t6]++;
        counts.c5[t5]++;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    // convert counts to offsets
    for(x = 0; x < 256; x++) {
        t8 = o8 + counts.c8[x];
        t7 = o7 + counts.c7[x];
        t6 = o6 + counts.c6[x];
        t5 = o5 + counts.c5[x];
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
        counts.c8[x] = o8;
        counts.c7[x] = o7;
        counts.c6[x] = o6;
        counts.c5[x] = o5;
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
        o8 = t8; 
        o7 = t7; 
        o6 = t6; 
        o5 = t5; 
        o4 = t4; 
        o3 = t3; 
        o2 = t2; 
        o1 = t1;
    }
    // radix
    for(x = 0; x < size; x++) {
        t8 = array[x] & 0xff;
        cpy[counts.c8[t8]] = array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 8) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 16) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    for(x = 0; x < size; x++) {
        t5 = (cpy[x] >> 24) & 0xff;
        array[counts.c5[t5]] = cpy[x];
        counts.c5[t5]++;
    }
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 32) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++;
    }
    for(x = 0; x < size; x++) {
        t3 = (cpy[x] >> 40) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++;
    }
    for(x = 0; x < size; x++) {
        t2 = (array[x] >> 48) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++;
    }
    for(x = 0; x < size; x++) {
        t1 = (cpy[x] >> 56) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++;
    }
    free(cpy);
    return array;
}

I blindly followed the pattern and doubled the number of variables making t16...t1 and o16...o1 and counts.c16 to counts.c1 but it does not give the right answer. It sorts uints less that 2^64 but does not sort larger ints correctly. I was wondering if >> is the problem. Should I expect it to work for __uint128's in gcc? Can anyone make a version that does work with __int128_t's. I would like to sort the array as quickly as possible.

My non-working code:

typedef union {
    struct {
        uint32_t c16[256];
        uint32_t c15[256];
        uint32_t c14[256];
        uint32_t c13[256];
        uint32_t c12[256];
        uint32_t c11[256];
        uint32_t c10[256];
        uint32_t c9[256];
        uint32_t c8[256];
        uint32_t c7[256];
        uint32_t c6[256];
        uint32_t c5[256];
        uint32_t c4[256];
        uint32_t c3[256];
        uint32_t c2[256];
        uint32_t c1[256];
    };
    uint32_t counts[256 * 16];
} rscounts_t;

__uint128_t * radixSort(__uint128_t * array, uint32_t size) {
    rscounts_t counts;
    memset(&counts, 0, 256 * 16 * sizeof(uint32_t));
    __uint128_t * cpy = (__uint128_t *)malloc(size * sizeof(__uint128_t));
    uint32_t o16=0, o15=0, o14=0, o13=0, o12=0, o11=0, o10=0, o9=0, o8=0, o7=0, o6=0, o5=0, o4=0, o3=0, o2=0, o1=0;
    uint32_t t16, t15, t14, t13, t12, t11, t10, t9, t8, t7, t6, t5, t4, t3, t2, t1;
    uint32_t x;
    // calculate counts
    for(x = 0; x < size; x++) {
        t16 = array[x] & 0xff;
        t15 = (array[x] >> 8) & 0xff;
        t14 = (array[x] >> 16) & 0xff;
        t13 = (array[x] >> 24) & 0xff;
        t12 = (array[x] >> 32) & 0xff;
        t11 = (array[x] >> 40) & 0xff;
        t10 = (array[x] >> 48) & 0xff;
        t9 = (array[x] >> 56) & 0xff;
        t8 = (array[x] >> 64) & 0xff;
        t7 = (array[x] >> 72) & 0xff;
        t6 = (array[x] >> 80) & 0xff;
        t5 = (array[x] >> 88) & 0xff;
        t4 = (array[x] >> 96) & 0xff;
        t3 = (array[x] >> 104) & 0xff;
        t2 = (array[x] >> 112) & 0xff;
        t1 = (array[x] >> 120) & 0xff;
        counts.c16[t16]++;
        counts.c15[t15]++;
        counts.c14[t14]++;
        counts.c13[t13]++;
        counts.c12[t12]++;
        counts.c11[t11]++;
        counts.c10[t10]++;
        counts.c9[t9]++;
        counts.c8[t8]++;
        counts.c7[t7]++;
        counts.c6[t6]++;
        counts.c5[t5]++;
        counts.c4[t4]++;
        counts.c3[t3]++;
        counts.c2[t2]++;
        counts.c1[t1]++;
    }
    // convert counts to offsets
    for(x = 0; x < 256; x++) {
        t16 = o16 + counts.c16[x];
        t15 = o15 + counts.c15[x];
        t14 = o14 + counts.c14[x];
        t13 = o13 + counts.c13[x];
        t12 = o12 + counts.c12[x];
        t11 = o11 + counts.c11[x];
        t10 = o10 + counts.c10[x];
        t9 = o9 + counts.c9[x];
        t8 = o8 + counts.c8[x];
        t7 = o7 + counts.c7[x];
        t6 = o6 + counts.c6[x];
        t5 = o5 + counts.c5[x];
        t4 = o4 + counts.c4[x];
        t3 = o3 + counts.c3[x];
        t2 = o2 + counts.c2[x];
        t1 = o1 + counts.c1[x];
    counts.c16[x] = o16;
        counts.c15[x] = o15;
        counts.c14[x] = o14;
        counts.c13[x] = o13;
        counts.c12[x] = o12;
        counts.c11[x] = o11;
        counts.c10[x] = o10;
        counts.c9[x] = o9;
        counts.c8[x] = o8;
        counts.c7[x] = o7;
        counts.c6[x] = o6;
        counts.c5[x] = o5;
        counts.c4[x] = o4;
        counts.c3[x] = o3;
        counts.c2[x] = o2;
        counts.c1[x] = o1;
    o16 = t16; 
        o15 = t15; 
        o14 = t14; 
        o13 = t13; 
        o12 = t12; 
        o11 = t11; 
        o10 = t10; 
        o9 = t9;
        o8 = t8; 
        o7 = t7; 
        o6 = t6; 
        o5 = t5; 
        o4 = t4; 
        o3 = t3; 
        o2 = t2; 
        o1 = t1;
    }
    // radix
    
    for(x = 0; x < size; x++) {
        t16 = array[x] & 0xff;
        cpy[counts.c16[t16]] = array[x];
        counts.c16[t16]++;
    }
    for(x = 0; x < size; x++) {
        t15 = (cpy[x] >> 8) & 0xff;
        array[counts.c15[t15]] = cpy[x];
        counts.c15[t15]++;
    }
    for(x = 0; x < size; x++) {
        t14 = (array[x] >> 16) & 0xff;
        cpy[counts.c14[t14]] = array[x];
        counts.c14[t14]++;
    }
    for(x = 0; x < size; x++) {
        t13 = (cpy[x] >> 24) & 0xff;
        array[counts.c13[t13]] = cpy[x];
        counts.c13[t13]++;
    }
    for(x = 0; x < size; x++) {
        t12 = (array[x] >> 32) & 0xff;
        cpy[counts.c12[t12]] = array[x];
        counts.c12[t12]++;
    }
    for(x = 0; x < size; x++) {
        t11 = (cpy[x] >> 40) & 0xff;
        array[counts.c11[t11]] = cpy[x];
        counts.c11[t11]++;
    }
    for(x = 0; x < size; x++) {
        t10 = (array[x] >> 48) & 0xff;
        cpy[counts.c10[t10]] = array[x];
        counts.c10[t10]++;
    }
    for(x = 0; x < size; x++) {
        t9 = (cpy[x] >> 56) & 0xff;
        array[counts.c9[t9]] = cpy[x];
        counts.c9[t9]++;
    }
    for(x = 0; x < size; x++) {
        t8 = (cpy[x] >> 64) & 0xff;
       cpy[counts.c8[t8]] =array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 72) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 80) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    for(x = 0; x < size; x++) {
        t5 = (cpy[x] >> 88) & 0xff;
        array[counts.c5[t5]] = cpy[x];
        counts.c5[t5]++;
    }
    for(x = 0; x < size; x++) {
        t4 = (array[x] >> 96) & 0xff;
        cpy[counts.c4[t4]] = array[x];
        counts.c4[t4]++;
    }
    for(x = 0; x < size; x++) {
        t3 = (cpy[x] >> 104) & 0xff;
        array[counts.c3[t3]] = cpy[x];
        counts.c3[t3]++;
    }
    for(x = 0; x < size; x++) {
        t2 = (array[x] >> 112) & 0xff;
        cpy[counts.c2[t2]] = array[x];
        counts.c2[t2]++;
    }
    for(x = 0; x < size; x++) {
        t1 = (cpy[x] >> 120) & 0xff;
        array[counts.c1[t1]] = cpy[x];
        counts.c1[t1]++;
    }

    free(cpy);
    return array;
}

Bubbler · Accepted Answer

The t8/c8 block you just edited actually had more bug.

    //...
    for(x = 0; x < size; x++) {
        t10 = (array[x] >> 48) & 0xff;
        cpy[counts.c10[t10]] = array[x];
        counts.c10[t10]++;
    }
    for(x = 0; x < size; x++) {
        t9 = (cpy[x] >> 56) & 0xff;
        array[counts.c9[t9]] = cpy[x];
        counts.c9[t9]++;
    }
    for(x = 0; x < size; x++) {
        t8 = (cpy[x] >> 64) & 0xff; //<== See?
       cpy[counts.c8[t8]] =array[x];
        counts.c8[t8]++;
    }
    for(x = 0; x < size; x++) {
        t7 = (cpy[x] >> 72) & 0xff;
        array[counts.c7[t7]] = cpy[x];
        counts.c7[t7]++;
    }
    for(x = 0; x < size; x++) {
        t6 = (array[x] >> 80) & 0xff;
        cpy[counts.c6[t6]] = array[x];
        counts.c6[t6]++;
    }
    //...

Each block alternates the roles between array and cpy, one being the array to read the values and the other to write them in sorted order in that radix. Your t8 block breaks the pattern. It should be changed as follows:

    for(x = 0; x < size; x++) {
        t8 = (array[x] >> 64) & 0xff;
        cpy[counts.c8[t8]] = array[x];
        counts.c8[t8]++;
    }

Nate Eldredge raised a very good point in this comment.

Btw, it seems like this code would be far better if written with arrays and loops, instead of having 16 of every variable and hoping you don't make any typos when cutting and pasting and changing constants from line to line.

So I quickly wrote a version using loops: (typedef, naming, and formatting are just my preference)

typedef __uint128_t u128;
typedef uint32_t u32;
u128* radix_sort(u128* array, u32 size) {
    u32 counts[16][256];
    memset(&counts, 0, 256 * 16 * sizeof(u32));
    u128* cpy = (u128*) malloc(size * sizeof(u128));
    u32 o[16] = {0};
    u32 t[16];
    u32 x, pos;
    u128* array_from, * array_to;
    for (x = 0; x < size; x++) {
        for (pos = 0; pos < 16; pos++) {
            t[pos] = (array[x] >> 8 * pos) & 0xff;
            counts[pos][t[pos]]++;
        }
    }
    for (x = 0; x < 256; x++) {
        for (pos = 0; pos < 16; pos++) {
            t[pos] = o[pos] + counts[pos][x];
            counts[pos][x] = o[pos];
            o[pos] = t[pos];
        }
    }
    for (pos = 0; pos < 16; pos++) {
        array_from = pos % 2 == 0 ? array : cpy;
        array_to = pos % 2 == 0 ? cpy : array;
        for (x = 0; x < size; x++) {
            t[pos] = (array_from[x] >> 8 * pos) & 0xff;
            array_to[counts[pos][t[pos]]] = array_from[x];
            counts[pos][t[pos]]++;
        }
    }
    free(cpy);
    return array;
}

Much shorter and easier to read :) (Not sure if it's better to write array_from/array_to like this or to swap after each iteration)

Godbolt shows that with optimizations turned on, GCC indeed fully unrolls inner loops of length 16 (and seemingly reorders the instructions as necessary).

This version gives yet another insight: every t[pos] variable is written once and used in the same inner loop, and the value is never referenced later (they're simply overwritten in the later loops). So we can optimize u32 t[16] and all t[pos]'s into simply t, giving this:

typedef __uint128_t u128;
typedef uint32_t u32;
u128* radix_sort(u128* array, u32 size) {
    u32 counts[16][256];
    memset(&counts, 0, 256 * 16 * sizeof(u32));
    u128* cpy = (u128*) malloc(size * sizeof(u128));
    u32 o[16] = {0};
    u32 t, x, pos;
    u128* array_from, * array_to;
    for (x = 0; x < size; x++) {
        for (pos = 0; pos < 16; pos++) {
            t = (array[x] >> 8 * pos) & 0xff;
            counts[pos][t]++;
        }
    }
    for (x = 0; x < 256; x++) {
        for (pos = 0; pos < 16; pos++) {
            t = o[pos] + counts[pos][x];
            counts[pos][x] = o[pos];
            o[pos] = t;
        }
    }
    for (pos = 0; pos < 16; pos++) {
        array_from = pos % 2 == 0 ? array : cpy;
        array_to = pos % 2 == 0 ? cpy : array;
        for (x = 0; x < size; x++) {
            t = (array_from[x] >> 8 * pos) & 0xff;
            array_to[counts[pos][t]] = array_from[x];
            counts[pos][t]++;
        }
    }
    free(cpy);
    return array;
}

Problem with making radix sort __uint128_t array

Answers (1)

Related Questions