user3354212
user3354212

Reputation: 1112

how to take the majority values across multiple data frames

I have three data frames (df1, df2, df3) that are generated by different methods. They have the same data structure but their values may be different. Each cell is be one of the four values "A","B","H", or "-". I would like to make a consensus table from the three data frames by taking the majority values, otherwise giving "-". Really appreciate any helps.

df1 = read.table(text="ID   S01 S02 S03 S04 S05
M01 A   H   A   B   B
M02 A   H   A   B   A
M03 A   A   H   B   A
M04 B   A   H   B   H
M05 B   A   H   B   A
M06 B   B   H   B   A
M07 H   B   B   H   B
M08 H   B   B   H   A
M09 H   B   B   H   A
M10 H   B   B   H   A", header=T, stringsAsFactors=F)

df2 = read.table(text="ID   S01 S02 S03 S04 S05
M01 A   H   A   B   A
M02 A   H   A   B   A
M03 H   A   H   B   A
M04 H   A   H   B   A
M05 B   A   H   B   A
M06 B   A   B   B   A
M07 -   B   B   -   B
M08 H   B   B   H   A
M09 H   B   B   H   A
M10 H   B   B   H   A", header=T, stringsAsFactors=F)

df3 = read.table(text="ID   S01 S02 S03 S04 S05
M01 B   H   A   B   A
M02 A   H   A   B   A
M03 B   A   H   B   A
M04 B   A   H   B   B
M05 B   A   H   B   A
M06 B   A   H   B   A
M07 A   B   B   H   H
M08 H   B   B   H   A
M09 H   B   B   H   A
M10 H   B   B   H   A", header=T, stringsAsFactors=F)

The expected result:

df = read.table(text="ID    S01 S02 S03 S04 S05
M01 A   H   A   B   A
M02 A   H   A   B   A
M03 -   A   H   B   A
M04 B   A   H   B   -
M05 B   A   H   B   A
M06 B   A   H   B   A
M07 -   B   B   H   B
M08 H   B   B   H   A
M09 H   B   B   H   A
M10 H   B   B   H   A", header=T, stringsAsFactors=F)

Upvotes: 0

Views: 116

Answers (3)

Gregory Demin
Gregory Demin

Reputation: 4836

Base R solution:

options(stringsAsFactors = FALSE)
moda = function(x){
    # here we rely on the fact that we have only three data.frame's
    dupl = anyDuplicated(x)
    if(dupl){
        x[dupl]
    } else {
        "-"
    }
}

aggregate(. ~ ID, 
          data = rbind(df1, df2, df3), 
          FUN = moda
          )

#     ID S01 S02 S03 S04 S05
# 1  M01   A   H   A   B   A
# 2  M02   A   H   A   B   A
# 3  M03   -   A   H   B   A
# 4  M04   B   A   H   B   -
# 5  M05   B   A   H   B   A
# 6  M06   B   A   H   B   A
# 7  M07   -   B   B   H   B
# 8  M08   H   B   B   H   A
# 9  M09   H   B   B   H   A
# 10 M10   H   B   B   H   A

Upvotes: 1

jdobres
jdobres

Reputation: 11957

Similar to @akrun's answer, but I have a slightly different way of joining together the tables and finding the mode of each cell:

Join the tables into "data.master"):

df1$df <- 1
df2$df <- 2
df3$df <- 3

data.master <- do.call(rbind, list(df1, df2, df3))

Compute modes:

library(dplyr)

data.mode <- data.master %>% 
    group_by(ID) %>% 
    summarize_all(function(x) ifelse(sort(table(x), decreasing = T)[1] > 1, names(sort(table(x), decreasing = T))[1], '-')) %>% 
    select(-df)

      ID   S01   S02   S03   S04   S05
   <chr> <chr> <chr> <chr> <chr> <chr>
1    M01     A     H     A     B     A
2    M02     A     H     A     B     A
3    M03     -     A     H     B     A
4    M04     B     A     H     B     -
5    M05     B     A     H     B     A
6    M06     B     A     H     B     A
7    M07     -     B     B     H     B
8    M08     H     B     B     H     A
9    M09     H     B     B     H     A
10   M10     H     B     B     H     A

Upvotes: 2

akrun
akrun

Reputation: 886928

We keep the datasets in a list, rbind it, then grouped by 'ID', loop through the columns, get the Mode of the elements

library(data.table)
Mode <- function(x) {
  if(uniqueN(x)==length(x)){
   "-" } else {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))] }
 }

rbindlist(mget(paste0("df", 1:3)))[, lapply(.SD, Mode) , by = ID]
#      ID S01 S02 S03 S04 S05
# 1: M01   A   H   A   B   A
# 2: M02   A   H   A   B   A
# 3: M03   -   A   H   B   A
# 4: M04   B   A   H   B   -
# 5: M05   B   A   H   B   A
# 6: M06   B   A   H   B   A
# 7: M07   -   B   B   H   B
# 8: M08   H   B   B   H   A
# 9: M09   H   B   B   H   A
#10: M10   H   B   B   H   A

Upvotes: 2

Related Questions