Reputation: 366
I have a list of dataframes in R, each loaded from a different file containing ranks. For example the files could contain finishing positions for different athletes in different races.
The same element (athlete) can appear in more than one dataframe (race) but no dataframe will necessarily contain all elements.
I would like to populate a matrix of rankings with athletes as rows and races as columns. Where there is no ranking for an athlete in a particular race it should read 0.
For example, if I have:
[[1]]
name rank
1 Alice 1
2 Bob 2
3 Carla 3
4 Diego 4
[[2]]
name rank
1 Alice 2
2 Carla 1
3 Eric 3
4 Frank 4
5 Gary 5
[[3]]
name rank
1 Bob 5
2 Carla 4
3 Diego 3
4 Eric 1
5 Gary 2
I would like to generate a matrix:
1 2 3
Alice 1 2 0
Bob 2 0 5
Carla 3 1 4
Diego 4 0 3
Eric 0 3 1
Frank 0 4 0
Gary 0 5 2
I am looking for an efficient way to do this: my data is more like 200 dataframes and 10000 ranked elements per dataframe (15000 unique elements in total) so the final matrix will be approx 15000x200
Upvotes: 2
Views: 333
Reputation: 49448
Here's a simpler reshape2
solution:
library(reshape2)
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
# name 1 2 3
#1 Alice 1 2 0
#2 Bob 2 0 5
#3 Carla 3 1 4
#4 Diego 4 0 3
#5 Eric 0 3 1
#6 Frank 0 4 0
#7 Gary 0 5 2
Arun's benchmarks were pretty interesting, and it seems like what data.table
does really well is the melting part, and what reshape2
does really well is the dcast
, so here's the best of both worlds:
library(reshape2)
library(data.table)
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
Using Arun's benchmark data:
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
mix_of_both = function(ll) {
pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')
}
require(microbenchmark)
microbenchmark(Arun_data.table(ll), mix_of_both(ll), times = 10)
# Unit: milliseconds
# expr min lq median uq max neval
# Arun_data.table(ll) 2568.333 2586.0079 2626.7704 2832.8076 2911.1314 10
# mix_of_both(ll) 615.166 739.9383 766.8994 788.5822 821.0478 10
Upvotes: 2
Reputation: 118859
Here's a solution using reshape2
package:
require(reshape2)
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
where ll
is your list of data.frame
s.
or equivalently:
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
A data.table
solution:
require(data.table)
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:3)][is.na(rank), rank := 0L][, as.list(rank), by = name]
name V1 V2 V3
1: Alice 1 2 0
2: Bob 2 0 5
3: Carla 3 1 4
4: Diego 4 0 3
5: Eric 0 3 1
6: Frank 0 4 0
7: Gary 0 5 2
Some benchmarking (now that we've quite some answers):
names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)
dd_create <- function() {
nrow <- sample(c(100:500), 1)
ncol <- 3
data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}
ll <- replicate(1e3, dd_create(), simplify = FALSE)
require(reshape2)
require(data.table)
Arun1_reshape2 <- function(ll) {
# same as @agstudy's
dcast(do.call(rbind, lapply(seq_along(ll), function(ix)
transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
}
Arun2_reshape2 <- function(ll) {
dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))),
name ~ id, value.var = "rank", fill = 0)
}
eddi_reshape2 <- function(ll) {
dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
}
Arun_data.table <- function(ll) {
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}
merge.all <- function(x, y) {
merge(x, y, all=TRUE, by="name")
}
Hong_Ooi <- function(ll) {
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
}
require(microbenchmark)
microbenchmark( arun1 <- Arun1_reshape2(ll),
arun2 <- Arun2_reshape2(ll),
eddi <- eddi_reshape2(ll),
hong <- Hong_Ooi(ll),
arun.dt <- Arun_data.table(ll), times=10)
Unit: seconds
expr min lq median uq max neval
arun1 <- Arun1_reshape2(ll) 9.157160 9.177143 9.366775 9.715767 28.043125 10
arun2 <- Arun2_reshape2(ll) 8.408356 8.437066 8.494233 9.018796 10.075029 10
eddi <- eddi_reshape2(ll) 8.056605 8.314110 8.402396 8.474129 9.124581 10
hong <- Hong_Ooi(ll) 82.457432 82.716930 82.908646 108.413217 321.164598 10
arun.dt <- Arun_data.table(ll) 2.006474 2.123331 2.212783 2.311619 2.738914 10
Upvotes: 2
Reputation: 57696
Another Reduce
use case, it seems.
merge.all <- function(x, y)
merge(x, y, all=TRUE, by="name")
# to avoid problems with merged name clashes
for(i in seq_along(ll))
names(ll[[i]])[2] <- paste0("rank", i)
out <- Reduce(merge.all, ll)
You'll have to modify your data frames slightly to avoid merge
complaining about name collisions; a for
loop works as well as anything for this purpose.
Any missing races will have NA. You can replace them with 0 by out[is.na(out)] <- 0
; you should ask yourself whether this is sensible though. For example, if you do this, then simple summary statistics like means, variances etc will give misleading results. The same holds if you want to do any more complicated modelling. By contrast, most R modelling functions will be smart enough to exclude NAs.
Upvotes: 1
Reputation: 121608
here the data since the OP don't give a reproducible example :
dput(ll)
list(structure(list(name = structure(1:4, .Label = c("Alice",
"Bob", "Carla", "Diego"), class = "factor"), rank = 1:4), .Names = c("name",
"rank"), class = "data.frame", row.names = c("1", "2", "3", "4"
)), structure(list(name = structure(1:5, .Label = c("Alice",
"Carla", "Eric", "Frank", "Gary"), class = "factor"), rank = c(2L,
1L, 3L, 4L, 5L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")), structure(list(name = structure(1:5, .Label = c("Bob",
"Carla", "Diego", "Eric", "Gary"), class = "factor"), rank = c(5L,
4L, 3L, 1L, 2L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5")))
Nearly the same solution as @Arun one, but in 2 separtes steps:
## add race column
ll <- lapply(seq_along(ll),function(x){
ll[[x]]$race <- x
ll[[x]]
})
## create a long data.frame
dd <- do.call(rbind,ll)
## transform to the wide format
library(reshape2)
dcast(name~race,data=dd,fill=0,value.var='rank')
name 1 2 3
1 Alice 1 2 0
2 Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5 Eric 0 3 1
6 Frank 0 4 0
7 Gary 0 5 2
Upvotes: 1