husna
husna

Reputation: 187

Efficient way to find rows with same elements in a 3D matrix in C

I have a 3D matrix mat[100][100][100]. What is the efficient way to find a row with same elements that appears in mat[0][][], mat[1][][],....,mat[99][][]? A simple approach would be comparing each row of mat[0][][] to all rows of the remaining 99 matrices, but it wouldn't be very efficient(I guess). Is there a better way to do it?

Upvotes: 3

Views: 192

Answers (3)

Bing Bang
Bing Bang

Reputation: 544

I finally found some time to write the content addressable code. It turns out to be much faster than using hash tables. But the catch is that the code is way more complex and the program takes WAY more memory. My final opinion is that unless you really need the extra speed, stick with the hash table.

Some examples of test runs are given below. The argument to the program specify the number of unique rows. The program fills the rest with randomly chosen existing rows. Then the rows are shuffled. The program looks for all duplicate rows and reports the number of duplicate rows and the time it took for both hash and content addressable tables.

bing@mint5 ~ $ cc -O2 cattest.c -o cattest
bing@mint5 ~ $ ./cattest 500
CAT Test        9500    0.0083
Hash Test       9500    0.0499
bing@mint5 ~ $ ./cattest 5000
CAT Test        5000    0.0195
Hash Test       5000    0.1477
bing@mint5 ~ $ ./cattest 9000
CAT Test        1000    0.0321
Hash Test       1000    0.1092






/* content addressable table vs hash table */
/* written by Bing H Bang */
/* I DONOT give permission to any snot-nosed students to copy my work and turn it in
as their own */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/sysinfo.h>

double etime()
{
        struct timeval tv;
    double dt, df;

    gettimeofday(&tv, NULL);
    dt = (double)(tv.tv_sec);
    df = ((double)(tv.tv_usec))/1000000.0;
    return(dt+df);
}

struct CAT_entry
    {
    unsigned fval;
    unsigned short rows[10000];
    unsigned short num;
    unsigned short used;
    struct CAT_entry *next;
    } *CAT[256] = {NULL};

struct CAT_entry stmem[10000];
int stidx = 0;

unsigned dat[100][10000];
char map[10000];
unsigned hasht[10000];

#define highbit (1 << ((sizeof(unsigned)*8)-1))

unsigned
rotxor(unsigned sum, unsigned v)
{
    if((sum & highbit) == 0)
        return ((sum << 1) ^ v);
    else
        return (((sum << 1) | 1) ^ v);
}

unsigned
compute_hash(int y)
{
    int x;
    unsigned sum = 0;

    for(x = 0; x < 100; ++x)
        sum = rotxor(sum, dat[x][y]);

    return sum;
}

void
mk_hasht()
{
    int y;

    for(y = 0; y < 10000; ++y)
        hasht[y] = compute_hash(y);
}

clearmap()
{
    memset((void *)map, 0, 10000);
}

comprow(int y, int yd)
{
    int x;

    for(x = 0; x < 100; ++x)
        if(dat[x][y] != dat[x][yd])
            return 0;

    return 1;
}

struct CAT_entry **
srch_CAT(unsigned value)
{
    struct CAT_entry **p = &(CAT[value&255]);
    static struct CAT_entry *r = NULL;

    while(*p != NULL)
        {
        if((*p)->fval == value)
            break;

        if((*p)->fval > value)
            return &r;
        else
            p = &((*p)->next);
        }

    return p;
}

void
add_entry(int y, unsigned value)
{
    struct CAT_entry **p = &(CAT[value&255]), *q;

    while(*p != NULL)
        {
        q = *p;
        if(q->fval == value)
            {
            q->rows[q->num] = y;
            q->num++;
            return;
            }

        if(q->fval > value)
            break;

        p = &(q->next);
        }

    q = *p;
    //*p = malloc(sizeof(struct CAT_entry));

    *p = &stmem[stidx++];

    (*p)->next = q;
    q = *p;
    q->fval = value;
    q->num = 0;
    q->used = 0;
}

void
mk_CAT()
{
    int x,y;
    struct CAT_entry **p, *q;

    for(y = 0; y < 10000; y++)
        add_entry(y, dat[0][y]);

    for(x=0; x < 256; ++x)
        {
        p = &(CAT[x]);
        while(*p != NULL)
            {
            q = *p;
            if(q->num == 0)
                {
                *p = q->next;
                //free(q);
                }
            else
                p = &(q->next);
            }
        }
}

void
gen_data(int npat)
{
    int x, y, rnum, limit;
    unsigned r;

    srandom(time(NULL));
    rnum = npat * 100;

    for(y = 0; y < rnum; ++y)
        dat[y%100][y/100] = random();

    for(y = npat; y < 10000; ++y)
        {
        rnum = random() % npat;
        for(x = 0; x < 100; ++x)
            dat[x][y]=dat[x][rnum];
        }

    for(y = 0; y < 10000; ++y)
        {
        rnum = random() % 10000;
        if(rnum == y)
            continue;

        for(x = 0; x < 100; ++x)
            {
            r = dat[x][y];
            dat[x][y]=dat[x][rnum];
            dat[x][rnum] = r;
            }
        }
}

int
do_CATtest()
{
    int y, yd, count = 0, i;
    struct CAT_entry **p, *q;

    mk_CAT();
    clearmap();

    for(y = 0; y < 9999; ++y)
        {
        if(map[y] == 0)
            {
            map[y] = 1;
            if(*(p = srch_CAT(dat[0][y])) != NULL)
                {
                for(q = *p, i = 0; i < q->num; ++i)
                    {
                    yd = q->rows[i];
                    if(map[yd] == 0)
                        {
                        if(comprow(y, yd))
                            {
                            map[yd] = 1;
                            ++count;
                            q->used++;
                            }
                        }
                    }

                if(q->num <= q->used)
                    *p = q->next;
                }
            }
        }

    return count;
}

int
do_hashtest()
{
    unsigned h;
    int x, y, yd, count = 0;

    mk_hasht();
    clearmap();

    for(y = 0; y < 9999; ++y)
        {
        if(map[y] != 0)
            continue;

        map[y] = 1;
        h = hasht[y];
        for(yd = y+1; yd < 10000; ++yd)
            {
            if(map[yd] != 0)
                continue;

            if(h == hasht[yd])
                if(comprow(y, yd))
                    {
                    map[yd] = 1;
                    ++count;
                    }
            }
        }

    return count;
}

main(int c, char *v[])
{
    int npat = 0, count;
    double t1, t2;

    if(c == 2)
        npat = atoi(v[1]);

    if(npat <= 0 || npat >= 10000)
        {
        puts("input param error");
        exit(1);
        }

    gen_data(npat);
    npat = 10000 - npat;

    t1 = etime();

    if((count = do_CATtest()) != npat)
        {
        printf("CAT test error, %d matches found, not %d", count, npat);
        exit(1);
        }

    t2 = etime();

    printf("CAT Test    %d  %.4f\n", npat, t2-t1);

    t1 = etime();
    if((count = do_hashtest()) != npat)
        {
        printf("hash test error, %d matches found, not %d", count, npat);
        exit(1);
        }

    t2 = etime();

    printf("Hash Test   %d  %.4f\n", npat, t2-t1);
}

Upvotes: 1

Bing Bang
Bing Bang

Reputation: 544

Make a content addressable table of the first values in each row. Then go through each row, take the first value and look it up on the table. If the lookup returns multiple rows, then those rows should be checked for a match. The searched rows should be remembered as to increase efficiency because the checked rows need not be checked again. You'll end up with a list of identical row groupings.

Upvotes: 0

user3386109
user3386109

Reputation: 34839

To expand on the comment by @chux, the first step is to compute a hash value for each row of each matrix. That's 10000 hash values in all. The results should be stored in an array of 10000 structs.

struct info
{
    int m;          // the matrix number
    int row;        // the row number
    uint32_t hash;  // the hash value for mat[m][row]
};

static struct info hashArray[10000];

After filling in all 10000 entries of the hashArray, sort the array by hash value. Then you can simply scan the array to find any duplicate hash values. When you do find duplicates, you need to confirm by comparing the row elements.

Upvotes: 1

Related Questions