Reputation: 187
I have a 3D matrix mat[100][100][100]
. What is the efficient way to find a row with same elements that appears in mat[0][][], mat[1][][],....,mat[99][][]
?
A simple approach would be comparing each row of mat[0][][]
to all rows of the remaining 99 matrices, but it wouldn't be very efficient(I guess). Is there a better way to do it?
Upvotes: 3
Views: 192
Reputation: 544
I finally found some time to write the content addressable code. It turns out to be much faster than using hash tables. But the catch is that the code is way more complex and the program takes WAY more memory. My final opinion is that unless you really need the extra speed, stick with the hash table.
Some examples of test runs are given below. The argument to the program specify the number of unique rows. The program fills the rest with randomly chosen existing rows. Then the rows are shuffled. The program looks for all duplicate rows and reports the number of duplicate rows and the time it took for both hash and content addressable tables.
bing@mint5 ~ $ cc -O2 cattest.c -o cattest
bing@mint5 ~ $ ./cattest 500
CAT Test 9500 0.0083
Hash Test 9500 0.0499
bing@mint5 ~ $ ./cattest 5000
CAT Test 5000 0.0195
Hash Test 5000 0.1477
bing@mint5 ~ $ ./cattest 9000
CAT Test 1000 0.0321
Hash Test 1000 0.1092
/* content addressable table vs hash table */
/* written by Bing H Bang */
/* I DONOT give permission to any snot-nosed students to copy my work and turn it in
as their own */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include <errno.h>
#include <string.h>
#include <sys/time.h>
#include <sys/sysinfo.h>
double etime()
{
struct timeval tv;
double dt, df;
gettimeofday(&tv, NULL);
dt = (double)(tv.tv_sec);
df = ((double)(tv.tv_usec))/1000000.0;
return(dt+df);
}
struct CAT_entry
{
unsigned fval;
unsigned short rows[10000];
unsigned short num;
unsigned short used;
struct CAT_entry *next;
} *CAT[256] = {NULL};
struct CAT_entry stmem[10000];
int stidx = 0;
unsigned dat[100][10000];
char map[10000];
unsigned hasht[10000];
#define highbit (1 << ((sizeof(unsigned)*8)-1))
unsigned
rotxor(unsigned sum, unsigned v)
{
if((sum & highbit) == 0)
return ((sum << 1) ^ v);
else
return (((sum << 1) | 1) ^ v);
}
unsigned
compute_hash(int y)
{
int x;
unsigned sum = 0;
for(x = 0; x < 100; ++x)
sum = rotxor(sum, dat[x][y]);
return sum;
}
void
mk_hasht()
{
int y;
for(y = 0; y < 10000; ++y)
hasht[y] = compute_hash(y);
}
clearmap()
{
memset((void *)map, 0, 10000);
}
comprow(int y, int yd)
{
int x;
for(x = 0; x < 100; ++x)
if(dat[x][y] != dat[x][yd])
return 0;
return 1;
}
struct CAT_entry **
srch_CAT(unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]);
static struct CAT_entry *r = NULL;
while(*p != NULL)
{
if((*p)->fval == value)
break;
if((*p)->fval > value)
return &r;
else
p = &((*p)->next);
}
return p;
}
void
add_entry(int y, unsigned value)
{
struct CAT_entry **p = &(CAT[value&255]), *q;
while(*p != NULL)
{
q = *p;
if(q->fval == value)
{
q->rows[q->num] = y;
q->num++;
return;
}
if(q->fval > value)
break;
p = &(q->next);
}
q = *p;
//*p = malloc(sizeof(struct CAT_entry));
*p = &stmem[stidx++];
(*p)->next = q;
q = *p;
q->fval = value;
q->num = 0;
q->used = 0;
}
void
mk_CAT()
{
int x,y;
struct CAT_entry **p, *q;
for(y = 0; y < 10000; y++)
add_entry(y, dat[0][y]);
for(x=0; x < 256; ++x)
{
p = &(CAT[x]);
while(*p != NULL)
{
q = *p;
if(q->num == 0)
{
*p = q->next;
//free(q);
}
else
p = &(q->next);
}
}
}
void
gen_data(int npat)
{
int x, y, rnum, limit;
unsigned r;
srandom(time(NULL));
rnum = npat * 100;
for(y = 0; y < rnum; ++y)
dat[y%100][y/100] = random();
for(y = npat; y < 10000; ++y)
{
rnum = random() % npat;
for(x = 0; x < 100; ++x)
dat[x][y]=dat[x][rnum];
}
for(y = 0; y < 10000; ++y)
{
rnum = random() % 10000;
if(rnum == y)
continue;
for(x = 0; x < 100; ++x)
{
r = dat[x][y];
dat[x][y]=dat[x][rnum];
dat[x][rnum] = r;
}
}
}
int
do_CATtest()
{
int y, yd, count = 0, i;
struct CAT_entry **p, *q;
mk_CAT();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] == 0)
{
map[y] = 1;
if(*(p = srch_CAT(dat[0][y])) != NULL)
{
for(q = *p, i = 0; i < q->num; ++i)
{
yd = q->rows[i];
if(map[yd] == 0)
{
if(comprow(y, yd))
{
map[yd] = 1;
++count;
q->used++;
}
}
}
if(q->num <= q->used)
*p = q->next;
}
}
}
return count;
}
int
do_hashtest()
{
unsigned h;
int x, y, yd, count = 0;
mk_hasht();
clearmap();
for(y = 0; y < 9999; ++y)
{
if(map[y] != 0)
continue;
map[y] = 1;
h = hasht[y];
for(yd = y+1; yd < 10000; ++yd)
{
if(map[yd] != 0)
continue;
if(h == hasht[yd])
if(comprow(y, yd))
{
map[yd] = 1;
++count;
}
}
}
return count;
}
main(int c, char *v[])
{
int npat = 0, count;
double t1, t2;
if(c == 2)
npat = atoi(v[1]);
if(npat <= 0 || npat >= 10000)
{
puts("input param error");
exit(1);
}
gen_data(npat);
npat = 10000 - npat;
t1 = etime();
if((count = do_CATtest()) != npat)
{
printf("CAT test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("CAT Test %d %.4f\n", npat, t2-t1);
t1 = etime();
if((count = do_hashtest()) != npat)
{
printf("hash test error, %d matches found, not %d", count, npat);
exit(1);
}
t2 = etime();
printf("Hash Test %d %.4f\n", npat, t2-t1);
}
Upvotes: 1
Reputation: 544
Make a content addressable table of the first values in each row. Then go through each row, take the first value and look it up on the table. If the lookup returns multiple rows, then those rows should be checked for a match. The searched rows should be remembered as to increase efficiency because the checked rows need not be checked again. You'll end up with a list of identical row groupings.
Upvotes: 0
Reputation: 34839
To expand on the comment by @chux, the first step is to compute a hash value for each row of each matrix. That's 10000 hash values in all. The results should be stored in an array of 10000 structs.
struct info
{
int m; // the matrix number
int row; // the row number
uint32_t hash; // the hash value for mat[m][row]
};
static struct info hashArray[10000];
After filling in all 10000 entries of the hashArray
, sort the array by hash value. Then you can simply scan the array to find any duplicate hash values. When you do find duplicates, you need to confirm by comparing the row elements.
Upvotes: 1