Reputation: 93
I have two data frames in R, and I need to count the element matches row by row, getting finally a column with the length of the cartesian product of both tables and the IDs of both rows. Also, the tables are quite big and with different number of rows, but same number of columns.
I have the following code, but it is quite slow when having multiple runs.
library(data.table)
table_1<-data.table(matrix(c(1:24),nrow = 4))
table_2<-data.table(matrix(c(11:34),nrow = 4))
names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")
table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))
setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))
CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
join<-CJ.table(table_1,table_2)
R<-subset(join, select=c("ID_ap","ID"))
R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
(join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
(join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
(join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
(join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
(join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
which gives
R
ID_ap ID Ac
1: 1 1 0
2: 1 2 0
3: 1 3 4
4: 1 4 0
5: 2 1 0
6: 2 2 0
7: 2 3 0
8: 2 4 4
9: 3 1 3
10: 3 2 0
11: 3 3 0
12: 3 4 0
13: 4 1 0
14: 4 2 3
15: 4 3 0
16: 4 4 0
Upvotes: 9
Views: 1118
Reputation: 2743
The performance requirements aren't clearly stated in the post. But, I've created a larger version of your reproducible example (below) and the code in the question is pretty fast already.
Here's how to do it in base R, for good measure:
t1 <- as.data.frame(table_1)
t2 <- as.data.frame(table_2)
system.time({
## compute all combinations of indices
indices <- merge(t1[1], t2[1])
## create a mega df including all rows, cbinded together
rows <- cbind(t1[indices[ ,"ID"], -1], t2[indices[ , "ID_ap"], -1])
t1_cols <- names(rows) %in% names(t1)
t2_cols <- names(rows) %in% names(t2)
## compute the counts; this step takes most of the time
## ~ 14 of the 18 second in this example
counts <- apply(rows, 1, function(r) sum(r[t1_cols] %in% r[t2_cols]))
})
out <- data.frame(indices, Ac=counts)
For example, for the large reproducible problem from below (dim(out) == c(1e6, 3)
), the above code runs in less than 20 seconds.
user system elapsed
17.879 0.348 18.245
Edit Large reproducible problem:
library(data.table)
NROW <- 1e4
NROW2 <- 1e2
table_1<-data.table(matrix(c(1:24),nrow = NROW, ncol=6))
table_2<-data.table(matrix(c(11:34),nrow = NROW2, ncol=6))
names(table_1)<-c("s1", "s2","s3","s4","s5","s6")
names(table_2)<-c("a1","a2","a3","a4","a5","a6")
table_1$ID<-seq.int(nrow(table_1))
table_2$ID_ap<-seq.int(nrow(table_2))
setcolorder(table_1, c("ID", "s1", "s2","s3","s4","s5","s6"))
setcolorder(table_2, c("ID_ap","a1","a2","a3","a4","a5","a6"))
The OP's solution runs much faster than this answer
CJ.table<-function(X,Y) setkey(X[,c(k=1,.SD)],k)[Y[,c(k=1,.SD)],allow.cartesian=TRUE][,k:=NULL]
join<-CJ.table(table_1,table_2)
R<-subset(join, select=c("ID_ap","ID"))
system.time({
R$Ac<- (join$s1 == join$a1) + (join$s1 ==join$a2) + (join$s1 ==join$a3) + (join$s1 ==join$a4) + (join$s1 ==join$a5) + (join$s1 ==join$a6)+
(join$s2 == join$a1) + (join$s2 ==join$a2) + (join$s2 ==join$a3) + (join$s2 ==join$a4) + (join$s2 ==join$a5) + (join$s2 ==join$a6)+
(join$s3 == join$a1) + (join$s3 ==join$a2) + (join$s3 ==join$a3) + (join$s3 ==join$a4) + (join$s3 ==join$a5) + (join$s3 ==join$a6)+
(join$s4 == join$a1) + (join$s4 ==join$a2) + (join$s4 ==join$a3) + (join$s4 ==join$a4) + (join$s4 ==join$a5) + (join$s4 ==join$a6)+
(join$s5 == join$a1) + (join$s5 ==join$a2) + (join$s5 ==join$a3) + (join$s5 ==join$a4) + (join$s5 ==join$a5) + (join$s5 ==join$a6)+
(join$s6 == join$a1) + (join$s6 ==join$a2) + (join$s6 ==join$a3) + (join$s6 ==join$a4) + (join$s6 ==join$a5) + (join$s6 ==join$a6)
})
# user system elapsed
# 0.295 0.044 0.339
but the solution in Frank's answer is faster still
setnames(table_2, "ID_ap", "ID")
tabs = rbind(
melt(table_1, id="ID")[, variable := NULL],
melt(table_2, id="ID")[, variable := NULL],
idcol = TRUE)
system.time({out3 <- tabs[,
if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
, by=value][,
.N
, by=.(ID1, ID2)]})
# user system elapsed
# 0.109 0.013 0.122
Upvotes: 2
Reputation: 13122
Assuming that the product of the number of rows and the number of unique values in both tables is not large:
x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)
with common unique values:
lvs = union(x1, x2)
And tabulate
the occurence of each unique value in each row for each table:
tab1 = matrix(tabulate(seq_len(nrow(table_1)) + (match(x1, lvs) - 1L) * nrow(table_1),
nrow(table_1) * length(lvs)),
nrow(table_1), length(lvs))
tab2 = matrix(tabulate(seq_len(nrow(table_2)) + (match(x2, lvs) - 1L) * nrow(table_2),
nrow(table_2) * length(lvs)),
nrow(table_2), length(lvs))
finally:
tcrossprod(tab1, tab2) #or 'tcrossprod(tab1 > 0L, tab2 > 0L)' to not count duplicate matches
# [,1] [,2] [,3] [,4]
#[1,] 0 0 3 0
#[2,] 0 0 0 3
#[3,] 4 0 0 0
#[4,] 0 4 0 0
#and to change format (among different ways):
ans = tcrossprod(tab1, tab2)
cbind(c(row(ans)), c(col(ans)), c(ans))
If tab1
and tab2
are very large, they can be built as sparse matrices and a way could be:
library(Matrix)
stab1 = xtabs(rep_len(1L, length(x1)) ~
rep_len(seq_len(nrow(table_1)), length(x1))
+ factor(match(x1, lvs), lvs),
sparse = TRUE)
stab2 = xtabs(rep_len(1L, length(x2)) ~
rep_len(seq_len(nrow(table_2)), length(x2))
+ factor(match(x2, lvs), lvs),
sparse = TRUE)
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
# 1 2 3 4
#1 . . 3 .
#2 . . . 3
#3 4 . . .
#4 . 4 . .
EDIT
Having (1) small positive integer values and (2) distinct values in each row, creating lookups with match
/unique
/union
and tabulating can be avoided:
x1 = unlist(table_1, FALSE, FALSE)
x2 = unlist(table_2, FALSE, FALSE)
nlvs = max(max(x1), max(x2))
stab1 = sparseMatrix(i = rep_len(seq_len(nrow(table_1)), length(x1)),
j = x1,
x = 1L,
dims = c(nrow(table_1), nlvs))
stab2 = sparseMatrix(i = rep_len(seq_len(nrow(table_2)), length(x2)),
j = x2,
x = 1L,
dims = c(nrow(table_2), nlvs))
tcrossprod(stab1, stab2)
#4 x 4 sparse Matrix of class "dgCMatrix"
#
#[1,] . . 3 .
#[2,] . . . 3
#[3,] 4 . . .
#[4,] . 4 . .
summary(tcrossprod(stab1, stab2))
#4 x 4 sparse Matrix of class "dgCMatrix", with 4 entries
# i j x
#1 3 1 4
#2 4 2 4
#3 1 3 3
#4 2 4 3
Upvotes: 3
Reputation: 49448
How about:
colSums(apply(join[, !c("ID", "ID_ap"), with = F], 1, duplicated))
#[1] 0 0 4 0 0 0 0 4 3 0 0 0 0 3 0 0
Or, starting from scratch:
setkey(table_1, ID)
setkey(table_2, ID_ap)
ids = CJ(ID1 = table_1$ID, ID2 = table_2$ID_ap)
ids[, sum(duplicated(c(table_1[.(ID1), !'ID', with = F],
table_2[.(ID2), !'ID_ap', with = F])))
, by = .(ID1, ID2)]
# ID1 ID2 V1
# 1: 1 1 0
# 2: 1 2 0
# 3: 1 3 3
# 4: 1 4 0
# 5: 2 1 0
# 6: 2 2 0
# 7: 2 3 0
# 8: 2 4 3
# 9: 3 1 4
#10: 3 2 0
#11: 3 3 0
#12: 3 4 0
#13: 4 1 0
#14: 4 2 4
#15: 4 3 0
#16: 4 4 0
Upvotes: 2
Reputation: 66819
Put the data in long format, since the column order does not matter:
setnames(table_2, "ID_ap", "ID")
tabs = rbind(
melt(table_1, id="ID")[, variable := NULL],
melt(table_2, id="ID")[, variable := NULL],
idcol = TRUE)
(1) For each value, identify relevant pairs; and
(2) for pairs, count values:
tabs[,
if (uniqueN(.id) > 1L) CJ(ID1 = ID[.id == 1L], ID2 = ID[.id == 2L])
, by=value][,
.N
, by=.(ID1, ID2)]
ID1 ID2 N
1: 3 1 4
2: 4 2 4
3: 1 3 3
4: 2 4 3
All other (ID1, ID2)
combos are zero and need not be explicitly enumerated, I think.
If values are distinct within each table, as in the OP's example, then we can simplify:
tabs[, if (.N==2L) .(ID1 = ID[1L], ID2 = ID[2L]), by=value][, .N, by=.(ID1, ID2)]
Upvotes: 5
Reputation: 15163
Here's one possibility:
> t1<-data.frame(matrix(c(1:24),nrow = 4))
> t2<-data.frame(matrix(c(11:34),nrow = 4))
> ret<-expand.grid(r1=1:nrow(t1),r2=1:nrow(t2))
> ret$matches<-apply(ret,1,function(a)sum(t1[a[1],] %in% t2[a[2],]))
> ret
r1 r2 matches
1 1 1 0
2 2 1 0
3 3 1 4
4 4 1 0
5 1 2 0
6 2 2 0
7 3 2 0
8 4 2 4
9 1 3 3
10 2 3 0
11 3 3 0
12 4 3 0
13 1 4 0
14 2 4 3
15 3 4 0
16 4 4 0
Upvotes: 1