Nick
Nick

Reputation: 366

Combine incomplete dataframes in R into matrix

I have a list of dataframes in R, each loaded from a different file containing ranks. For example the files could contain finishing positions for different athletes in different races.

The same element (athlete) can appear in more than one dataframe (race) but no dataframe will necessarily contain all elements.

I would like to populate a matrix of rankings with athletes as rows and races as columns. Where there is no ranking for an athlete in a particular race it should read 0.

For example, if I have:

[[1]]
   name rank
1 Alice    1
2   Bob    2
3 Carla    3
4 Diego    4

[[2]]
   name rank
1 Alice    2
2 Carla    1
3  Eric    3
4 Frank    4
5  Gary    5

[[3]]
   name rank
1   Bob    5
2 Carla    4
3 Diego    3
4  Eric    1
5  Gary    2

I would like to generate a matrix:

      1 2 3
Alice 1 2 0
Bob   2 0 5
Carla 3 1 4
Diego 4 0 3
Eric  0 3 1
Frank 0 4 0
Gary  0 5 2

I am looking for an efficient way to do this: my data is more like 200 dataframes and 10000 ranked elements per dataframe (15000 unique elements in total) so the final matrix will be approx 15000x200

Upvotes: 2

Views: 333

Answers (4)

eddi
eddi

Reputation: 49448

Here's a simpler reshape2 solution:

library(reshape2)

dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
#   name 1 2 3
#1 Alice 1 2 0
#2   Bob 2 0 5
#3 Carla 3 1 4
#4 Diego 4 0 3
#5  Eric 0 3 1
#6 Frank 0 4 0
#7  Gary 0 5 2

Arun's benchmarks were pretty interesting, and it seems like what data.table does really well is the melting part, and what reshape2 does really well is the dcast, so here's the best of both worlds:

library(reshape2)
library(data.table)

pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
dcast(pp, name ~ id, fill = 0, value.var = 'rank')

Using Arun's benchmark data:

names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)

dd_create <- function() {
    nrow <- sample(c(100:500), 1)
    ncol <- 3
    data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}

ll <- replicate(1e3, dd_create(), simplify = FALSE)

Arun_data.table <- function(ll) {
    pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
    setkey(pp, "name", "id")
    pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}

mix_of_both = function(ll) {
    pp = rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
    dcast(pp, name ~ id, fill = 0, value.var = 'rank')
}

require(microbenchmark)
microbenchmark(Arun_data.table(ll), mix_of_both(ll), times = 10)
# Unit: milliseconds
#                expr      min        lq    median        uq       max neval
# Arun_data.table(ll) 2568.333 2586.0079 2626.7704 2832.8076 2911.1314    10
#     mix_of_both(ll)  615.166  739.9383  766.8994  788.5822  821.0478    10

Upvotes: 2

Arun
Arun

Reputation: 118859

Here's a solution using reshape2 package:

require(reshape2)
dcast(do.call(rbind, lapply(seq_along(ll), function(ix) 
         transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)

   name 1 2 3
1 Alice 1 2 0
2   Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5  Eric 0 3 1
6 Frank 0 4 0
7  Gary 0 5 2

where ll is your list of data.frames.


or equivalently:

dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))), 
    name ~ id, value.var = "rank", fill = 0)

A data.table solution:

require(data.table)
pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
setkey(pp, "name", "id")
pp[CJ(unique(name), 1:3)][is.na(rank), rank := 0L][, as.list(rank), by = name]

    name V1 V2 V3
1: Alice  1  2  0
2:   Bob  2  0  5
3: Carla  3  1  4
4: Diego  4  0  3
5:  Eric  0  3  1
6: Frank  0  4  0
7:  Gary  0  5  2

Some benchmarking (now that we've quite some answers):

names <- tapply(sample(letters, 1e4, replace=TRUE), rep(1:(1e4/5), each=5), paste, collapse="")
names <- unique(names)

dd_create <- function() {
    nrow <- sample(c(100:500), 1)
    ncol <- 3
    data.frame(name = sample(names, nrow, replace=FALSE), rank = sample(nrow))
}

ll <- replicate(1e3, dd_create(), simplify = FALSE)

require(reshape2)
require(data.table)
Arun1_reshape2 <- function(ll) {
    # same as @agstudy's 
    dcast(do.call(rbind, lapply(seq_along(ll), function(ix) 
             transform(ll[[ix]], id = ix))), name ~ id, value.var="rank", fill=0)
}

Arun2_reshape2 <- function(ll) {
    dcast(transform(do.call(rbind, ll), id = rep(seq_along(ll), sapply(ll, nrow))), 
        name ~ id, value.var = "rank", fill = 0)
}

eddi_reshape2 <- function(ll) {
    dcast(melt(ll, id.vars = 'name'), name ~ L1, fill = 0)
}

Arun_data.table <- function(ll) {
    pp <- rbindlist(ll)[, id := rep(seq_along(ll), sapply(ll, nrow))]
    setkey(pp, "name", "id")
    pp[CJ(unique(name), 1:1000)][is.na(rank), rank := 0L][, as.list(rank), by = name]
}

merge.all <- function(x, y) {
    merge(x, y, all=TRUE, by="name")
}

Hong_Ooi <- function(ll) {
    for(i in seq_along(ll))
        names(ll[[i]])[2] <- paste0("rank", i)
    out <- Reduce(merge.all, ll)    
}

require(microbenchmark)
microbenchmark( arun1 <- Arun1_reshape2(ll), 
                arun2 <- Arun2_reshape2(ll), 
                eddi <- eddi_reshape2(ll), 
                hong <- Hong_Ooi(ll), 
                arun.dt <- Arun_data.table(ll), times=10)

Unit: seconds
                           expr       min        lq    median         uq        max neval
    arun1 <- Arun1_reshape2(ll)  9.157160  9.177143  9.366775   9.715767  28.043125    10
    arun2 <- Arun2_reshape2(ll)  8.408356  8.437066  8.494233   9.018796  10.075029    10
      eddi <- eddi_reshape2(ll)  8.056605  8.314110  8.402396   8.474129   9.124581    10
           hong <- Hong_Ooi(ll) 82.457432 82.716930 82.908646 108.413217 321.164598    10
 arun.dt <- Arun_data.table(ll)  2.006474  2.123331  2.212783   2.311619   2.738914    10

Upvotes: 2

Hong Ooi
Hong Ooi

Reputation: 57696

Another Reduce use case, it seems.

merge.all <- function(x, y)
merge(x, y, all=TRUE, by="name")

# to avoid problems with merged name clashes
for(i in seq_along(ll))
    names(ll[[i]])[2] <- paste0("rank", i)

out <- Reduce(merge.all, ll)

You'll have to modify your data frames slightly to avoid merge complaining about name collisions; a for loop works as well as anything for this purpose.

Any missing races will have NA. You can replace them with 0 by out[is.na(out)] <- 0; you should ask yourself whether this is sensible though. For example, if you do this, then simple summary statistics like means, variances etc will give misleading results. The same holds if you want to do any more complicated modelling. By contrast, most R modelling functions will be smart enough to exclude NAs.

Upvotes: 1

agstudy
agstudy

Reputation: 121608

here the data since the OP don't give a reproducible example :

dput(ll)
list(structure(list(name = structure(1:4, .Label = c("Alice", 
"Bob", "Carla", "Diego"), class = "factor"), rank = 1:4), .Names = c("name", 
"rank"), class = "data.frame", row.names = c("1", "2", "3", "4"
)), structure(list(name = structure(1:5, .Label = c("Alice", 
"Carla", "Eric", "Frank", "Gary"), class = "factor"), rank = c(2L, 
1L, 3L, 4L, 5L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5")), structure(list(name = structure(1:5, .Label = c("Bob", 
"Carla", "Diego", "Eric", "Gary"), class = "factor"), rank = c(5L, 
4L, 3L, 1L, 2L)), .Names = c("name", "rank"), class = "data.frame", row.names = c("1", 
"2", "3", "4", "5")))

Nearly the same solution as @Arun one, but in 2 separtes steps:

## add race column
ll <- lapply(seq_along(ll),function(x){
  ll[[x]]$race <- x
  ll[[x]]
  })
## create a long data.frame
dd <- do.call(rbind,ll)
## transform to the wide format
library(reshape2)

dcast(name~race,data=dd,fill=0,value.var='rank')

   name 1 2 3
1 Alice 1 2 0
2   Bob 2 0 5
3 Carla 3 1 4
4 Diego 4 0 3
5  Eric 0 3 1
6 Frank 0 4 0
7  Gary 0 5 2

Upvotes: 1

Related Questions