Mussa
Mussa

Reputation: 117

Taking average over some columns

I have such data

        > dput(head(my_data))
structure(list(`Sample Name` = c("ACTB", "ATP5F1", "DDX5", "EEF1G", 
"GAPDH", "NCL"), `31-29TRG1R1_1` = c(4526, 884, 11044, 10661, 
31760, 9417), `31-25TRG2R1_1` = c(5311, 841, 6401, 16073, 20432, 
4969), `31-18TRG1R1_1` = c(12487, 567, 13945, 16474, 43309, 11831
), `31-44TRG2R1_1` = c(9477, 1244, 7140, 6580, 12457, 5176), 
    `31-32TRG2R1_1` = c(13386, 1535, 14153, 11036, 26186, 8299
    ), `31-21TRG3R1_1` = c(8511, 1703, 8966, 14800, 57140, 7002
    ), `31-14TRG4R1_1` = c(9935, 1841, 19590, 16467, 34281, 12463
    ), `31-19TRG4R1_1` = c(5211, 917, 9144, 8024, 8200, 3935), 
    `31-29TRG1R2_1` = c(3181, 613, 7584, 7603, 19331, 6286), 
    `31-25TRG2R2_1` = c(115931, 11738, 88604, 255822, 447002, 
    70197), `31-23TRG1R1_1` = c(3156, 765, 18034, 18982, 17237, 
    18880), `31-RESECTION1_1` = c(3470, 506, 8858, 11829, 29934, 
    7857), `31-32TRG2R2_1` = c(241372, 21547, 165886, 138726, 
    442723, 94629), `31-21TRG3R2_1` = c(5102, 1217, 5977, 10041, 
    35615, 4740), `31-14TRG4R2_1` = c(5352, 1029, 14053, 10995, 
    18206, 8633), `31-24TRG4R1_1` = c(6990, 761, 4440, 2833, 
    8150, 1340), `31-46TRG1R1_1` = c(4290, 834, 8146, 22078, 
    13631, 7407), `31-27TRG2R1_1` = c(11262, 1153, 10063, 13690, 
    19430, 6470), `31-33TRG1R1_1` = c(4139, 338, 8260, 8650, 
    13916, 8000), `31-RESECTION1_2` = c(2812, 497, 8503, 10674, 
    22677, 7394), `31-53TRG2R1_1` = c(7438, 921, 6897, 10042, 
    15142, 8073), `31-22TRG3R1_1` = c(4784, 621, 9904, 7049, 
    13977, 4279), `31-6TRG4R1_1` = c(6749, 704, 9638, 12920, 
    14396, 8133), `31-17TRG3R1_1` = c(10074, 1056, 7966, 12489, 
    26819, 6404), `31-46TRG1R2_1` = c(6586, 1160, 10305, 30012, 
    21850, 9848), `31-27TRG2R2_1` = c(10368, 866, 7324, 10047, 
    17939, 4943), `31-5TRG2R1_1` = c(5479, 440, 8913, 7103, 15886, 
    5801), `31-RESECTION1_3` = c(3032, 474, 8484, 11152, 24847, 
    7741), `31-53TRG2R2_1` = c(10587, 1315, 8675, 12923, 23882, 
    10213), `31-22TRG3R2_1` = c(4885, 660, 9710, 7115, 15669, 
    4908), `31-6TRG4R2_1` = c(6683, 650, 7987, 11447, 14364, 
    7236), `31-16TRG4R1_1` = c(5428, 825, 11789, 12018, 6812, 
    5954), `31-9TRG1R1_1` = c(5723, 494, 7504, 5591, 9086, 2966
    ), `31-26TRG3R1_1` = c(2564, 465, 2115, 2810, 4022, 955), 
    `31-20TRG2R1_1` = c(1018, 491, 2325, 3460, 3469, 1863), `31-RESECTION2_1` = c(1011, 
    285, 2893, 4345, 6476, 1818), `31-52TRG2R1_1` = c(4128, 622, 
    5952, 5196, 7428, 4181), `31-38TRG3R1_1` = c(2281, 521, 12043, 
    8111, 16866, 5207), `31-49TRG4R1_1` = c(4566, 453, 8521, 
    9618, 26104, 8663), `31-15TRG4R1_1` = c(6501, 947, 10320, 
    7285, 10538, 4638), `31-9TRG1R2_1` = c(13212, 1131, 14045, 
    11418, 21843, 6245), `31-26TRG3R2_1` = c(5832, 972, 3012, 
    4851, 7559, 1560), `31-37TRG2R1_1` = c(7031, 690, 8208, 17495, 
    28514, 7058), `31-RESECTION2_2` = c(4095, 620, 6319, 10473, 
    25292, 4274), `31-52TRG2R1_2` = c(10240, 2169, 19618, 17052, 
    18671, 13688), `31-38TRG3R2_1` = c(5009, 874, 20191, 14132, 
    34612, 8855), `31-49TRG4R2_1` = c(10911, 1131, 19318, 21560, 
    58967, 19450), `31-48TRG2R1_1` = c(8758, 1002, 8044, 6677, 
    17354, 7355), `31-2TRG1R1_1` = c(10461, 1904, 15286, 16249, 
    15900, 14393), `31-36TRG3R1_1` = c(16430, 1401, 9522, 17646, 
    26764, 8063), `31-51TRG2R1_1` = c(151880, 16572, 93610, 110556, 
    303604, 57029), `31-RESECTION2_3` = c(9686, 1435, 14038, 
    23464, 61018, 8921), `31-42TRG2R1_1` = c(38649, 3795, 24468, 
    36134, 68794, 17827), `31-3TRG4R1_1` = c(14421, 1901, 34905, 
    21004, 47952, 12428), `31-35TRG4R1_1` = c(59268, 4752, 73216, 
    64854, 212997, 37064), `31-50TRG4R1_1` = c(6660, 701, 4092, 
    16796, 7958, 2408), `31-2TRG1R2_1` = c(4845, 1086, 9191, 
    8887, 6692, 8119), `31-36TRG3R2_1` = c(8418, 868, 6832, 11279, 
    13530, 5233), `31-28TRG4R1_1` = c(7980, 1343, 7342, 9874, 
    14286, 4255), `31-RESECTION3_1` = c(4330, 832, 4446, 6208, 
    13131, 8312), `31-42TRG2R2_1` = c(6745, 1007, 8185, 9790, 
    11478, 5764), `31-3TRG4R2_1` = c(4080, 769, 14648, 8343, 
    14580, 5035), `31-35TRG4R2_1` = c(5290, 684, 11139, 9057, 
    18928, 5325), `31-12TRG4R1_1` = c(7812, 1326, 5750, 9204, 
    12352, 5489), `31-40TRG1R1_1` = c(14399, 922, 13135, 20975, 
    50560, 9952), `31-30TRG5R1_1` = c(24261, 1986, 14361, 26032, 
    89023, 20479), `31-30TRG5R2_1` = c(24454, 2196, 16827, 27964, 
    93862, 21350), `31-34TRG3R1_1` = c(37303, 2998, 22122, 30431, 
    51981, 11737), `31-RESECTION3_2` = c(20019, 1158, 8349, 14393, 
    49299, 17807), `31-7TRG3R1_1` = c(22085, 1431, 12012, 15730, 
    40418, 10271), `31-43TRG3R1_1` = c(11531, 676, 13557, 6762, 
    33468, 5563), `31-11TRG4R1_1` = c(28845, 3840, 28234, 45017, 
    85101, 17277), `31-cells_1` = c(5252, 2077, 6668, 36973, 
    39909, 15035), `31-40TRG1R2_1` = c(43316, 3278, 40684, 65687, 
    157910, 29137), `31-39TRG3R1_1` = c(28273, 2169, 14697, 20890, 
    68353, 25293), `31-RESECTION3_3` = c(27828, 2144, 13246, 
    20487, 69043, 26125), `31-7TRG3R2_1` = c(10299, 879, 6798, 
    8131, 19650, 5182), `31-43TRG3R2_1` = c(16109, 918, 18933, 
    8892, 43115, 6850), `31-11TRG4R2_1` = c(20847, 2748, 22810, 
    32811, 60942, 11978), `31-cells_2` = c(8909, 3453, 10518, 
    49209, 46428, 19275), `31-41TRG1R1_1` = c(12732, 883, 11319, 
    12157, 40460, 10400), `31-8TRG1R1_1` = c(11408, 1007, 11396, 
    20424, 38188, 9570), `31-1TRG3R1_1` = c(9928, 1112, 5726, 
    6227, 12942, 3644), `31-RNA_1` = c(7453, 1324, 4461, 15932, 
    47845, 13711), `31-10TRG3R1_1` = c(13214, 1355, 13559, 14508, 
    31296, 9096), `31-31TRG4R1_1` = c(7230, 769, 11198, 11052, 
    39781, 12827), `31-45TRG3R1_1` = c(12182, 573, 2818, 3699, 
    4365, 1639), `31-cells_3` = c(6517, 1899, 5144, 23862, 30177, 
    8443), `31-41TRG1R2_1` = c(15229, 1485, 15090, 15091, 54369, 
    13341), `31-13TRG1R1_1` = c(15332, 1162, 18268, 20875, 62257, 
    10614), `31-4TRG3R1_1` = c(9947, 1115, 7267, 5957, 13831, 
    2793), `31-RNA_2` = c(10629, 2135, 8014, 27834, 92632, 25041
    ), `31-10TRG3R2_1` = c(18232, 2010, 19637, 19973, 46075, 
    12244), `31-31TRG4R2_1` = c(6362, 1110, 14307, 12233, 44153, 
    13017), `31-47TRG3R1_1` = c(14419, 1927, 7350, 10375, 15736, 
    3415), `31-RNA_3` = c(11938, 1911, 5565, 16496, 49358, 17899
    )), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
>

Some columns are duplicated, for instance

31-30TRG5R1_1 and 31-30TRG5R2_1 are duplications of an individual sample for which I want to take average of these and create a column named 31-30TRG5

In my data I have many of these cases for these the first part of name is the same with R1_1 and R2_1

Like picture

[![enter image description here][1]][1]

Is it possible to have R function to take average of these samples instead of manually locate them and taking average over genes for them?

Upvotes: 0

Views: 57

Answers (2)

Ronak Shah
Ronak Shah

Reputation: 388982

You can use split.default to split data based on similarity of their column names and take average.

result <- cbind(my_data[1], round(sapply(split.default(my_data[-1], 
             sub('\\d_\\d$', '', names(my_data)[-1])), rowMeans, na.rm = TRUE)))

Using sub we try to keep only the common part of the column names.

sub('\\d_\\d$', '', names(my_data)[-1])
# [1] "31-29TRG1R"   "31-25TRG2R"   "31-18TRG1R"   "31-44TRG2R"   "31-32TRG2R"   "31-21TRG3R"  
# [7] "31-14TRG4R"   "31-19TRG4R"   "31-29TRG1R"   "31-25TRG2R"   "31-23TRG1R"   "31-RESECTION"
#[13] "31-32TRG2R"   "31-21TRG3R"   "31-14TRG4R"   "31-24TRG4R"   "31-46TRG1R"   "31-27TRG2R"  
#[19] "31-33TRG1R"   "31-RESECTION" "31-53TRG2R"   "31-22TRG3R"   "31-6TRG4R"    "31-17TRG3R"  
#[25] "31-46TRG1R"   "31-27TRG2R"   "31-5TRG2R"    "31-RESECTION" "31-53TRG2R"   "31-22TRG3R"  
#[31] "31-6TRG4R"    "31-16TRG4R"   "31-9TRG1R"    "31-26TRG3R"   "31-20TRG2R"   "31-RESECTION"
#[37] "31-52TRG2R"   "31-38TRG3R"   "31-49TRG4R"   "31-15TRG4R"   "31-9TRG1R"    "31-26TRG3R"  
#[43] "31-37TRG2R"   "31-RESECTION" "31-52TRG2R"   "31-38TRG3R"   "31-49TRG4R"   "31-48TRG2R"  
#[49] "31-2TRG1R"    "31-36TRG3R"   "31-51TRG2R"   "31-RESECTION" "31-42TRG2R"   "31-3TRG4R"   
#[55] "31-35TRG4R"   "31-50TRG4R"   "31-2TRG1R"    "31-36TRG3R"   "31-28TRG4R"   "31-RESECTION"
#[61] "31-42TRG2R"   "31-3TRG4R"    "31-35TRG4R"   "31-12TRG4R"   "31-40TRG1R"   "31-30TRG5R"  
#[67] "31-30TRG5R"   "31-34TRG3R"   "31-RESECTION" "31-7TRG3R"    "31-43TRG3R"   "31-11TRG4R"  
#[73] "31-cells_1"   "31-40TRG1R"   "31-39TRG3R"   "31-RESECTION" "31-7TRG3R"    "31-43TRG3R"  
[#79] "31-11TRG4R"   "31-cells_2"   "31-41TRG1R"   "31-8TRG1R"    "31-1TRG3R"    "31-RNA_1"    
[#85] "31-10TRG3R"   "31-31TRG4R"   "31-45TRG3R"   "31-cells_3"   "31-41TRG1R"   "31-13TRG1R"  
#[91] "31-4TRG3R"    "31-RNA_2"     "31-10TRG3R"   "31-31TRG4R"   "31-47TRG3R"   "31-RNA_3"    

Here column number 1 and column number 10 form 1 group and similarly for other such groups.

Upvotes: 1

r2evans
r2evans

Reputation: 160437

I'm going to first "fix" the corrupted data, since you can't have two rows of column headers. The best fix is to begin with properly imported data, perhaps skipping one of the header rows when you read it in. Lacking that, ...

This makes the assumption that `Sample Name` is the only properly "string" data, and all others are strictly numeric.

library(dplyr)
colnames(my_data) <- my_data[1,]
my_data <- my_data[-1,]
my_data <- mutate_at(my_data, vars(-`Sample Name`), as.numeric)

From here, I'll pivot the data into a "longer" format, group by the stem of the sample name, summarize, and then un-pivot back into the "wider" format. (If you're going to stay with "tidy" principles, including the use of ggplot2, it is almost certainly better to keep your data in the "longer" format. Over to you!)

my_data2 <- my_data %>%
  tidyr::pivot_longer(., -`Sample Name`, names_to = "sample", values_to = "value") %>%
  mutate(sample0 = gsub("[_0-9]+$", "", sample)) %>%
  group_by(`Sample Name`, sample0) %>%
  summarize(
    sample = sample[1], # arbitrarily choosing the first sample name
    value = mean(value)
  ) %>%
  ungroup() %>%
  select(-sample0) %>%
  tidyr::pivot_wider(names_from = "sample", values_from = "value")
my_data2
# # A tibble: 5 x 57
#   `Sample Name` `31-10TRG3R1_1` `31-11TRG4R1_1` `31-12TRG4R1_1` `31-13TRG1R1_1` `31-14TRG4R1_1`
#   <chr>                   <dbl>           <dbl>           <dbl>           <dbl>           <dbl>
# 1 ACTB                   15723           24846             7812           15332           7644.
# 2 ATP5F1                  1682.           3294             1326            1162           1435 
# 3 DDX5                   16598           25522             5750           18268          16822.
# 4 EEF1G                  17240.          38914             9204           20875          13731 
# 5 GAPDH                  38686.          73022.           12352           62257          26244.
# # ... with 51 more variables: `31-15TRG4R1_1` <dbl>, `31-16TRG4R1_1` <dbl>,
# #   `31-17TRG3R1_1` <dbl>, `31-18TRG1R1_1` <dbl>, `31-19TRG4R1_1` <dbl>, `31-1TRG3R1_1` <dbl>,
# #   `31-20TRG2R1_1` <dbl>, `31-21TRG3R1_1` <dbl>, `31-22TRG3R1_1` <dbl>, `31-23TRG1R1_1` <dbl>,
# #   `31-24TRG4R1_1` <dbl>, `31-25TRG2R1_1` <dbl>, `31-26TRG3R1_1` <dbl>, `31-27TRG2R1_1` <dbl>,
# #   `31-28TRG4R1_1` <dbl>, `31-29TRG1R1_1` <dbl>, `31-2TRG1R1_1` <dbl>, `31-30TRG5R1_1` <dbl>,
# #   `31-31TRG4R1_1` <dbl>, `31-32TRG2R1_1` <dbl>, `31-33TRG1R1_1` <dbl>, `31-34TRG3R1_1` <dbl>,
# #   `31-35TRG4R1_1` <dbl>, `31-36TRG3R1_1` <dbl>, `31-37TRG2R1_1` <dbl>, `31-38TRG3R1_1` <dbl>,
# #   `31-39TRG3R1_1` <dbl>, `31-3TRG4R1_1` <dbl>, `31-40TRG1R1_1` <dbl>, `31-41TRG1R1_1` <dbl>,
# #   `31-42TRG2R1_1` <dbl>, `31-43TRG3R1_1` <dbl>, `31-44TRG2R1_1` <dbl>, `31-45TRG3R1_1` <dbl>,
# #   `31-46TRG1R1_1` <dbl>, `31-47TRG3R1_1` <dbl>, `31-48TRG2R1_1` <dbl>, `31-49TRG4R1_1` <dbl>,
# #   `31-4TRG3R1_1` <dbl>, `31-50TRG4R1_1` <dbl>, `31-51TRG2R1_1` <dbl>, `31-52TRG2R1_1` <dbl>,
# #   `31-53TRG2R1_1` <dbl>, `31-5TRG2R1_1` <dbl>, `31-6TRG4R1_1` <dbl>, `31-7TRG3R1_1` <dbl>,
# #   `31-8TRG1R1_1` <dbl>, `31-9TRG1R1_1` <dbl>, `31-cells_1` <dbl>, `31-RESECTION1_1` <dbl>,
# #   `31-RNA_1` <dbl>

As a demonstration of what was removed, we'll diff the column names, where these listed here have been removed (averaged into the original sample name):

setdiff(names(my_data), names(my_data2))
#  [1] "31-29TRG1R2_1"   "31-25TRG2R2_1"   "31-32TRG2R2_1"   "31-21TRG3R2_1"   "31-14TRG4R2_1"  
#  [6] "31-RESECTION1_2" "31-46TRG1R2_1"   "31-27TRG2R2_1"   "31-RESECTION1_3" "31-53TRG2R2_1"  
# [11] "31-22TRG3R2_1"   "31-6TRG4R2_1"    "31-RESECTION2_1" "31-9TRG1R2_1"    "31-26TRG3R2_1"  
# [16] "31-RESECTION2_2" "31-52TRG2R1_2"   "31-38TRG3R2_1"   "31-49TRG4R2_1"   "31-RESECTION2_3"
# [21] "31-2TRG1R2_1"    "31-36TRG3R2_1"   "31-RESECTION3_1" "31-42TRG2R2_1"   "31-3TRG4R2_1"   
# [26] "31-35TRG4R2_1"   "31-30TRG5R2_1"   "31-RESECTION3_2" "31-40TRG1R2_1"   "31-RESECTION3_3"
# [31] "31-7TRG3R2_1"    "31-43TRG3R2_1"   "31-11TRG4R2_1"   "31-cells_2"      "31-cells_3"     
# [36] "31-41TRG1R2_1"   "31-RNA_2"        "31-10TRG3R2_1"   "31-31TRG4R2_1"   "31-RNA_3"       

Upvotes: 1

Related Questions