Reputation: 4949
I have the following dataframe.
temp = structure(list(A = c(0, 0, 0, 3.72900887033786, 1.94860084749336,
0), C = c(0, 0, 0, 3.44095219802964, 2.35049724708413, 0.0285691521967709
), A = c(0, 0, 0, 3.29572302453997, 0.933572638261024, 0), D = c(0,
0, 0, 2.4905701304462, 1.54101915313356, 0), E = c(0, 0, 0, 4.23189316164533,
1.7311832415722, 0), E = c(0, 0, 0, 4.37851162325373, 2.50080205305716,
0), D = c(0, 0, 0, 3.68929916053589, 2.4905701304462, 0.189033824390017
), F = c(0, 2.27500704749987, 0, 3.68032435684402, 1.77820857639809,
0), A = c(0, 0, 0, 3.5668151540109, 1.72683121703249, 0.0285691521967709
), G = c(0, 0, 0, 5.6450098843911, 3.09929520433778, 0)), row.names = c("5_8S_rRNA",
"5S_rRNA", "7SK", "A1BG", "A1BG-AS1", "A1CF"), class = "data.frame")
It looks like this.
A C A D E E D F A G
5_8S_rRNA 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.00000000 0.000000
5S_rRNA 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 2.275007 0.00000000 0.000000
7SK 0.000000 0.00000000 0.0000000 0.000000 0.000000 0.000000 0.0000000 0.000000 0.00000000 0.000000
A1BG 3.729009 3.44095220 3.2957230 2.490570 4.231893 4.378512 3.6892992 3.680324 3.56681515 5.645010
A1BG-AS1 1.948601 2.35049725 0.9335726 1.541019 1.731183 2.500802 2.4905701 1.778209 1.72683122 3.099295
A1CF 0.000000 0.02856915 0.0000000 0.000000 0.000000 0.000000 0.1890338 0.000000 0.02856915 0.000000
What I'd like to do is collapse any columns that are duplicates by averaging them, and I want to do this row by row.
The ideal dataframe would contain the same number of rows, but only the columns A, C, D, E, F, G. Is this possible?
Upvotes: 8
Views: 379
Reputation: 101024
A dplyr + tidyr option using pivot_*:
library(dplyr)
library(tidyr)
library(tibble)  # for column_to_rownames()
temp %>%
  cbind(rmn = rownames(temp)) %>%                        # keep the gene IDs as a regular column
  pivot_longer(!rmn) %>%                                  # long format: one row per gene/column pair
  group_by(rmn, name) %>%
  summarise(value = mean(value), .groups = "drop") %>%   # average the duplicated columns
  pivot_wider(names_from = name) %>%                      # back to wide format
  column_to_rownames("rmn")
gives
A C D E F G
5_8S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
5S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 2.275007 0.000000
7SK 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
A1BG 3.530515683 3.44095220 3.08993465 4.305202 3.680324 5.645010
A1BG-AS1 1.536334901 2.35049725 2.01579464 2.115993 1.778209 3.099295
A1CF 0.009523051 0.02856915 0.09451691 0.000000 0.000000 0.000000
Or we can use xtabs to shorten the code a bit:
temp %>%
  cbind(rmn = rownames(temp)) %>%
  pivot_longer(!rmn) %>%
  group_by(rmn, name) %>%
  summarise(value = mean(value), .groups = "drop") %>%
  xtabs(value ~ ., .)
which yields
name
rmn A C D E F
5_8S_rRNA 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
5S_rRNA 0.000000000 0.000000000 0.000000000 0.000000000 2.275007047
7SK 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
A1BG 3.530515683 3.440952198 3.089934645 4.305202392 3.680324357
A1BG-AS1 1.536334901 2.350497247 2.015794642 2.115992647 1.778208576
A1CF 0.009523051 0.028569152 0.094516912 0.000000000 0.000000000
name
rmn G
5_8S_rRNA 0.000000000
5S_rRNA 0.000000000
7SK 0.000000000
A1BG 5.645009884
A1BG-AS1 3.099295204
A1CF 0.000000000
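Note that xtabs returns a table object rather than a data frame. If a plain data frame with the gene IDs as row names is wanted, the 2-d table can be converted back, e.g. (a minimal sketch, assuming the result of the pipeline above has been assigned to a hypothetical object xt):
# xt: the 2-d table returned by the xtabs() pipeline above (assumed assigned beforehand)
as.data.frame.matrix(xt)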
Upvotes: 0
Reputation: 29
Using data.table
library(data.table)
# transpose so that the original columns become rows, keeping their names in "rn"
d <- transpose(as.data.table(temp), keep.names = "rn")
# average the rows that share a name (i.e. the duplicated columns)
d <- d[, lapply(.SD, mean), by = rn]
# transpose back, taking the new column names from "rn", and re-attach the gene IDs
d <- cbind(rn = row.names(temp), transpose(d, make.names = "rn"))
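This leaves the gene identifiers in the rn column, since data.table does not use row names. A minimal sketch of getting back to the original layout (a plain data.frame with the identifiers as row names), assuming d holds the result from above:
res <- as.data.frame(d)   # drop the data.table class
rownames(res) <- res$rn   # restore the gene IDs as row names
res$rn <- NULL
res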
Upvotes: 2
Reputation: 16836
Here is a base R solution:
t(do.call(rbind, by(t(temp), row.names(t(temp)), FUN = colMeans)))
Or with tidyverse: we can split on the unique names, set the names for each data frame, and then use rowMeans. However, we have to use setNames in the first map statement to make the column names unique (illustrated after the code block), as tidyverse does not like duplicate column names. Since the row names also get dropped, we add them back at the end.
library(tidyverse)
map(.x = unique(names(temp)), ~
select(setNames(temp, make.names(names(temp), unique = TRUE)), starts_with(.x))) %>%
set_names(unique(names(temp))) %>%
map_dfc(., rowMeans) %>%
as.data.frame() %>%
`rownames<-`(row.names(temp))
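To see what make.names() does to the duplicated column names here (just an illustration, not part of the pipeline); starts_with("A") then picks up A, A.1 and A.2 together:
make.names(names(temp), unique = TRUE)
# "A"  "C"  "A.1"  "D"  "E"  "E.1"  "D.1"  "F"  "A.2"  "G"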
Or another base R solution:
temp2 <- t(temp)
t(tapply(temp2, list(row.names(temp2)[row(temp2)], colnames(temp2)[col(temp2)]), FUN = mean))
Output
A C D E F G
5_8S_rRNA 0.00000000 0.0000000 0.0000000 0.00000 0.00000 0.00000
5S_rRNA 0.00000000 0.0000000 0.0000000 0.00000 2.27501 0.00000
7SK 0.00000000 0.0000000 0.0000000 0.00000 0.00000 0.00000
A1BG 3.53051568 3.4409522 3.0899346 4.30520 3.68032 5.64501
A1BG-AS1 1.53633490 2.3504972 2.0157946 2.11599 1.77821 3.09930
A1CF 0.00952305 0.0285692 0.0945169 0.00000 0.00000 0.00000
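Both base R variants return a matrix rather than a data.frame (the tidyverse one already returns a data.frame); if the original class is needed, the call can simply be wrapped in as.data.frame(), e.g.:
as.data.frame(t(do.call(rbind, by(t(temp), row.names(t(temp)), FUN = colMeans))))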
Upvotes: 6
Reputation: 51894
Another base R solution with rowsum:
t(rowsum(t(temp), names(temp)) / c(table(names(temp))))
A C D E F G
5_8S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
5S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 2.275007 0.000000
7SK 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
A1BG 3.530515683 3.44095220 3.08993465 4.305202 3.680324 5.645010
A1BG-AS1 1.536334901 2.35049725 2.01579464 2.115993 1.778209 3.099295
A1CF 0.009523051 0.02856915 0.09451691 0.000000 0.000000 0.000000
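To unpack how this works: rowsum() sums the rows of the transposed data (i.e. the original columns) grouped by their names, and dividing by the per-name counts from table() turns those sums into means. A sketch of the intermediate pieces (object names are illustrative):
sums   <- rowsum(t(temp), group = names(temp))  # one row per unique column name, values summed
counts <- c(table(names(temp)))                 # how many original columns share each name
t(sums / counts)                                # sums / counts = means; t() puts the genes back on rows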
Upvotes: 7
Reputation: 61154
Is this what you're looking for?
library(tidyr)
library(dplyr)
output <- temp %>%
  add_rownames(var = "ID") %>%
  pivot_longer(-ID) %>%
  mutate(name = gsub("\\.*\\d*", "", name)) %>%   # strip the numeric suffixes added to duplicated names
  group_by(ID, name) %>%
  summarise(value = mean(value)) %>%
  pivot_wider(names_from = "name", values_from = "value")
output
# A tibble: 6 × 7
# Groups: ID [6]
ID A C D E F G
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 5_8S_rRNA 0 0 0 0 0 0
2 5S_rRNA 0 0 0 0 2.28 0
3 7SK 0 0 0 0 0 0
4 A1BG 3.53 3.44 3.09 4.31 3.68 5.65
5 A1BG-AS1 1.54 2.35 2.02 2.12 1.78 3.10
6 A1CF 0.00952 0.0286 0.0945 0 0 0
Upvotes: 4
Reputation: 886938
We could use split.default to split by the column names, loop over the list, and apply rowMeans:
sapply(split.default(temp, names(temp)), rowMeans)
A C D E F G
5_8S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
5S_rRNA 0.000000000 0.00000000 0.00000000 0.000000 2.275007 0.000000
7SK 0.000000000 0.00000000 0.00000000 0.000000 0.000000 0.000000
A1BG 3.530515683 3.44095220 3.08993465 4.305202 3.680324 5.645010
A1BG-AS1 1.536334901 2.35049725 2.01579464 2.115993 1.778209 3.099295
A1CF 0.009523051 0.02856915 0.09451691 0.000000 0.000000 0.000000
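Here split.default(temp, names(temp)) gives a named list with one sub-data.frame per unique column name (e.g. the A element holds the three A columns), and sapply() applies rowMeans to each element and simplifies the results into a matrix. A small sketch for inspecting the split and for getting a data.frame instead of a matrix, if preferred:
# the intermediate list: one sub-data.frame per unique column name
str(split.default(temp, names(temp)), max.level = 1)
# wrap the sapply() call in as.data.frame() if a data.frame rather than a matrix is wanted
as.data.frame(sapply(split.default(temp, names(temp)), rowMeans))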
Upvotes: 9