Reputation: 117
I have such data
> dput(head(my_data))
structure(list(`Sample Name` = c("ACTB", "ATP5F1", "DDX5", "EEF1G",
"GAPDH", "NCL"), `31-29TRG1R1_1` = c(4526, 884, 11044, 10661,
31760, 9417), `31-25TRG2R1_1` = c(5311, 841, 6401, 16073, 20432,
4969), `31-18TRG1R1_1` = c(12487, 567, 13945, 16474, 43309, 11831
), `31-44TRG2R1_1` = c(9477, 1244, 7140, 6580, 12457, 5176),
`31-32TRG2R1_1` = c(13386, 1535, 14153, 11036, 26186, 8299
), `31-21TRG3R1_1` = c(8511, 1703, 8966, 14800, 57140, 7002
), `31-14TRG4R1_1` = c(9935, 1841, 19590, 16467, 34281, 12463
), `31-19TRG4R1_1` = c(5211, 917, 9144, 8024, 8200, 3935),
`31-29TRG1R2_1` = c(3181, 613, 7584, 7603, 19331, 6286),
`31-25TRG2R2_1` = c(115931, 11738, 88604, 255822, 447002,
70197), `31-23TRG1R1_1` = c(3156, 765, 18034, 18982, 17237,
18880), `31-RESECTION1_1` = c(3470, 506, 8858, 11829, 29934,
7857), `31-32TRG2R2_1` = c(241372, 21547, 165886, 138726,
442723, 94629), `31-21TRG3R2_1` = c(5102, 1217, 5977, 10041,
35615, 4740), `31-14TRG4R2_1` = c(5352, 1029, 14053, 10995,
18206, 8633), `31-24TRG4R1_1` = c(6990, 761, 4440, 2833,
8150, 1340), `31-46TRG1R1_1` = c(4290, 834, 8146, 22078,
13631, 7407), `31-27TRG2R1_1` = c(11262, 1153, 10063, 13690,
19430, 6470), `31-33TRG1R1_1` = c(4139, 338, 8260, 8650,
13916, 8000), `31-RESECTION1_2` = c(2812, 497, 8503, 10674,
22677, 7394), `31-53TRG2R1_1` = c(7438, 921, 6897, 10042,
15142, 8073), `31-22TRG3R1_1` = c(4784, 621, 9904, 7049,
13977, 4279), `31-6TRG4R1_1` = c(6749, 704, 9638, 12920,
14396, 8133), `31-17TRG3R1_1` = c(10074, 1056, 7966, 12489,
26819, 6404), `31-46TRG1R2_1` = c(6586, 1160, 10305, 30012,
21850, 9848), `31-27TRG2R2_1` = c(10368, 866, 7324, 10047,
17939, 4943), `31-5TRG2R1_1` = c(5479, 440, 8913, 7103, 15886,
5801), `31-RESECTION1_3` = c(3032, 474, 8484, 11152, 24847,
7741), `31-53TRG2R2_1` = c(10587, 1315, 8675, 12923, 23882,
10213), `31-22TRG3R2_1` = c(4885, 660, 9710, 7115, 15669,
4908), `31-6TRG4R2_1` = c(6683, 650, 7987, 11447, 14364,
7236), `31-16TRG4R1_1` = c(5428, 825, 11789, 12018, 6812,
5954), `31-9TRG1R1_1` = c(5723, 494, 7504, 5591, 9086, 2966
), `31-26TRG3R1_1` = c(2564, 465, 2115, 2810, 4022, 955),
`31-20TRG2R1_1` = c(1018, 491, 2325, 3460, 3469, 1863), `31-RESECTION2_1` = c(1011,
285, 2893, 4345, 6476, 1818), `31-52TRG2R1_1` = c(4128, 622,
5952, 5196, 7428, 4181), `31-38TRG3R1_1` = c(2281, 521, 12043,
8111, 16866, 5207), `31-49TRG4R1_1` = c(4566, 453, 8521,
9618, 26104, 8663), `31-15TRG4R1_1` = c(6501, 947, 10320,
7285, 10538, 4638), `31-9TRG1R2_1` = c(13212, 1131, 14045,
11418, 21843, 6245), `31-26TRG3R2_1` = c(5832, 972, 3012,
4851, 7559, 1560), `31-37TRG2R1_1` = c(7031, 690, 8208, 17495,
28514, 7058), `31-RESECTION2_2` = c(4095, 620, 6319, 10473,
25292, 4274), `31-52TRG2R1_2` = c(10240, 2169, 19618, 17052,
18671, 13688), `31-38TRG3R2_1` = c(5009, 874, 20191, 14132,
34612, 8855), `31-49TRG4R2_1` = c(10911, 1131, 19318, 21560,
58967, 19450), `31-48TRG2R1_1` = c(8758, 1002, 8044, 6677,
17354, 7355), `31-2TRG1R1_1` = c(10461, 1904, 15286, 16249,
15900, 14393), `31-36TRG3R1_1` = c(16430, 1401, 9522, 17646,
26764, 8063), `31-51TRG2R1_1` = c(151880, 16572, 93610, 110556,
303604, 57029), `31-RESECTION2_3` = c(9686, 1435, 14038,
23464, 61018, 8921), `31-42TRG2R1_1` = c(38649, 3795, 24468,
36134, 68794, 17827), `31-3TRG4R1_1` = c(14421, 1901, 34905,
21004, 47952, 12428), `31-35TRG4R1_1` = c(59268, 4752, 73216,
64854, 212997, 37064), `31-50TRG4R1_1` = c(6660, 701, 4092,
16796, 7958, 2408), `31-2TRG1R2_1` = c(4845, 1086, 9191,
8887, 6692, 8119), `31-36TRG3R2_1` = c(8418, 868, 6832, 11279,
13530, 5233), `31-28TRG4R1_1` = c(7980, 1343, 7342, 9874,
14286, 4255), `31-RESECTION3_1` = c(4330, 832, 4446, 6208,
13131, 8312), `31-42TRG2R2_1` = c(6745, 1007, 8185, 9790,
11478, 5764), `31-3TRG4R2_1` = c(4080, 769, 14648, 8343,
14580, 5035), `31-35TRG4R2_1` = c(5290, 684, 11139, 9057,
18928, 5325), `31-12TRG4R1_1` = c(7812, 1326, 5750, 9204,
12352, 5489), `31-40TRG1R1_1` = c(14399, 922, 13135, 20975,
50560, 9952), `31-30TRG5R1_1` = c(24261, 1986, 14361, 26032,
89023, 20479), `31-30TRG5R2_1` = c(24454, 2196, 16827, 27964,
93862, 21350), `31-34TRG3R1_1` = c(37303, 2998, 22122, 30431,
51981, 11737), `31-RESECTION3_2` = c(20019, 1158, 8349, 14393,
49299, 17807), `31-7TRG3R1_1` = c(22085, 1431, 12012, 15730,
40418, 10271), `31-43TRG3R1_1` = c(11531, 676, 13557, 6762,
33468, 5563), `31-11TRG4R1_1` = c(28845, 3840, 28234, 45017,
85101, 17277), `31-cells_1` = c(5252, 2077, 6668, 36973,
39909, 15035), `31-40TRG1R2_1` = c(43316, 3278, 40684, 65687,
157910, 29137), `31-39TRG3R1_1` = c(28273, 2169, 14697, 20890,
68353, 25293), `31-RESECTION3_3` = c(27828, 2144, 13246,
20487, 69043, 26125), `31-7TRG3R2_1` = c(10299, 879, 6798,
8131, 19650, 5182), `31-43TRG3R2_1` = c(16109, 918, 18933,
8892, 43115, 6850), `31-11TRG4R2_1` = c(20847, 2748, 22810,
32811, 60942, 11978), `31-cells_2` = c(8909, 3453, 10518,
49209, 46428, 19275), `31-41TRG1R1_1` = c(12732, 883, 11319,
12157, 40460, 10400), `31-8TRG1R1_1` = c(11408, 1007, 11396,
20424, 38188, 9570), `31-1TRG3R1_1` = c(9928, 1112, 5726,
6227, 12942, 3644), `31-RNA_1` = c(7453, 1324, 4461, 15932,
47845, 13711), `31-10TRG3R1_1` = c(13214, 1355, 13559, 14508,
31296, 9096), `31-31TRG4R1_1` = c(7230, 769, 11198, 11052,
39781, 12827), `31-45TRG3R1_1` = c(12182, 573, 2818, 3699,
4365, 1639), `31-cells_3` = c(6517, 1899, 5144, 23862, 30177,
8443), `31-41TRG1R2_1` = c(15229, 1485, 15090, 15091, 54369,
13341), `31-13TRG1R1_1` = c(15332, 1162, 18268, 20875, 62257,
10614), `31-4TRG3R1_1` = c(9947, 1115, 7267, 5957, 13831,
2793), `31-RNA_2` = c(10629, 2135, 8014, 27834, 92632, 25041
), `31-10TRG3R2_1` = c(18232, 2010, 19637, 19973, 46075,
12244), `31-31TRG4R2_1` = c(6362, 1110, 14307, 12233, 44153,
13017), `31-47TRG3R1_1` = c(14419, 1927, 7350, 10375, 15736,
3415), `31-RNA_3` = c(11938, 1911, 5565, 16496, 49358, 17899
)), row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"
))
>
Some columns are duplicated, for instance
31-30TRG5R1_1
and 31-30TRG5R2_1
are duplications of an individual sample for which I want to take average of these and create a column named 31-30TRG5
In my data I have many of these cases for these the first part of name is the same with R1_1
and R2_1
Like picture
[![enter image description here][1]][1]
Is it possible to have R function to take average of these samples instead of manually locate them and taking average over genes for them?
Upvotes: 0
Views: 57
Reputation: 388982
You can use split.default
to split data based on similarity of their column names and take average.
result <- cbind(my_data[1], round(sapply(split.default(my_data[-1],
sub('\\d_\\d$', '', names(my_data)[-1])), rowMeans, na.rm = TRUE)))
Using sub
we try to keep only the common part of the column names.
sub('\\d_\\d$', '', names(my_data)[-1])
# [1] "31-29TRG1R" "31-25TRG2R" "31-18TRG1R" "31-44TRG2R" "31-32TRG2R" "31-21TRG3R"
# [7] "31-14TRG4R" "31-19TRG4R" "31-29TRG1R" "31-25TRG2R" "31-23TRG1R" "31-RESECTION"
#[13] "31-32TRG2R" "31-21TRG3R" "31-14TRG4R" "31-24TRG4R" "31-46TRG1R" "31-27TRG2R"
#[19] "31-33TRG1R" "31-RESECTION" "31-53TRG2R" "31-22TRG3R" "31-6TRG4R" "31-17TRG3R"
#[25] "31-46TRG1R" "31-27TRG2R" "31-5TRG2R" "31-RESECTION" "31-53TRG2R" "31-22TRG3R"
#[31] "31-6TRG4R" "31-16TRG4R" "31-9TRG1R" "31-26TRG3R" "31-20TRG2R" "31-RESECTION"
#[37] "31-52TRG2R" "31-38TRG3R" "31-49TRG4R" "31-15TRG4R" "31-9TRG1R" "31-26TRG3R"
#[43] "31-37TRG2R" "31-RESECTION" "31-52TRG2R" "31-38TRG3R" "31-49TRG4R" "31-48TRG2R"
#[49] "31-2TRG1R" "31-36TRG3R" "31-51TRG2R" "31-RESECTION" "31-42TRG2R" "31-3TRG4R"
#[55] "31-35TRG4R" "31-50TRG4R" "31-2TRG1R" "31-36TRG3R" "31-28TRG4R" "31-RESECTION"
#[61] "31-42TRG2R" "31-3TRG4R" "31-35TRG4R" "31-12TRG4R" "31-40TRG1R" "31-30TRG5R"
#[67] "31-30TRG5R" "31-34TRG3R" "31-RESECTION" "31-7TRG3R" "31-43TRG3R" "31-11TRG4R"
#[73] "31-cells_1" "31-40TRG1R" "31-39TRG3R" "31-RESECTION" "31-7TRG3R" "31-43TRG3R"
[#79] "31-11TRG4R" "31-cells_2" "31-41TRG1R" "31-8TRG1R" "31-1TRG3R" "31-RNA_1"
[#85] "31-10TRG3R" "31-31TRG4R" "31-45TRG3R" "31-cells_3" "31-41TRG1R" "31-13TRG1R"
#[91] "31-4TRG3R" "31-RNA_2" "31-10TRG3R" "31-31TRG4R" "31-47TRG3R" "31-RNA_3"
Here column number 1 and column number 10 form 1 group and similarly for other such groups.
Upvotes: 1
Reputation: 160437
I'm going to first "fix" the corrupted data, since you can't have two rows of column headers. The best fix is to begin with properly imported data, perhaps skipping one of the header rows when you read it in. Lacking that, ...
This makes the assumption that `Sample Name`
is the only properly "string" data, and all others are strictly numeric.
library(dplyr)
colnames(my_data) <- my_data[1,]
my_data <- my_data[-1,]
my_data <- mutate_at(my_data, vars(-`Sample Name`), as.numeric)
From here, I'll pivot the data into a "longer" format, group by the stem of the sample name, summarize, and then un-pivot back into the "wider" format. (If you're going to stay with "tidy" principles, including the use of ggplot2
, it is almost certainly better to keep your data in the "longer" format. Over to you!)
my_data2 <- my_data %>%
tidyr::pivot_longer(., -`Sample Name`, names_to = "sample", values_to = "value") %>%
mutate(sample0 = gsub("[_0-9]+$", "", sample)) %>%
group_by(`Sample Name`, sample0) %>%
summarize(
sample = sample[1], # arbitrarily choosing the first sample name
value = mean(value)
) %>%
ungroup() %>%
select(-sample0) %>%
tidyr::pivot_wider(names_from = "sample", values_from = "value")
my_data2
# # A tibble: 5 x 57
# `Sample Name` `31-10TRG3R1_1` `31-11TRG4R1_1` `31-12TRG4R1_1` `31-13TRG1R1_1` `31-14TRG4R1_1`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 ACTB 15723 24846 7812 15332 7644.
# 2 ATP5F1 1682. 3294 1326 1162 1435
# 3 DDX5 16598 25522 5750 18268 16822.
# 4 EEF1G 17240. 38914 9204 20875 13731
# 5 GAPDH 38686. 73022. 12352 62257 26244.
# # ... with 51 more variables: `31-15TRG4R1_1` <dbl>, `31-16TRG4R1_1` <dbl>,
# # `31-17TRG3R1_1` <dbl>, `31-18TRG1R1_1` <dbl>, `31-19TRG4R1_1` <dbl>, `31-1TRG3R1_1` <dbl>,
# # `31-20TRG2R1_1` <dbl>, `31-21TRG3R1_1` <dbl>, `31-22TRG3R1_1` <dbl>, `31-23TRG1R1_1` <dbl>,
# # `31-24TRG4R1_1` <dbl>, `31-25TRG2R1_1` <dbl>, `31-26TRG3R1_1` <dbl>, `31-27TRG2R1_1` <dbl>,
# # `31-28TRG4R1_1` <dbl>, `31-29TRG1R1_1` <dbl>, `31-2TRG1R1_1` <dbl>, `31-30TRG5R1_1` <dbl>,
# # `31-31TRG4R1_1` <dbl>, `31-32TRG2R1_1` <dbl>, `31-33TRG1R1_1` <dbl>, `31-34TRG3R1_1` <dbl>,
# # `31-35TRG4R1_1` <dbl>, `31-36TRG3R1_1` <dbl>, `31-37TRG2R1_1` <dbl>, `31-38TRG3R1_1` <dbl>,
# # `31-39TRG3R1_1` <dbl>, `31-3TRG4R1_1` <dbl>, `31-40TRG1R1_1` <dbl>, `31-41TRG1R1_1` <dbl>,
# # `31-42TRG2R1_1` <dbl>, `31-43TRG3R1_1` <dbl>, `31-44TRG2R1_1` <dbl>, `31-45TRG3R1_1` <dbl>,
# # `31-46TRG1R1_1` <dbl>, `31-47TRG3R1_1` <dbl>, `31-48TRG2R1_1` <dbl>, `31-49TRG4R1_1` <dbl>,
# # `31-4TRG3R1_1` <dbl>, `31-50TRG4R1_1` <dbl>, `31-51TRG2R1_1` <dbl>, `31-52TRG2R1_1` <dbl>,
# # `31-53TRG2R1_1` <dbl>, `31-5TRG2R1_1` <dbl>, `31-6TRG4R1_1` <dbl>, `31-7TRG3R1_1` <dbl>,
# # `31-8TRG1R1_1` <dbl>, `31-9TRG1R1_1` <dbl>, `31-cells_1` <dbl>, `31-RESECTION1_1` <dbl>,
# # `31-RNA_1` <dbl>
As a demonstration of what was removed, we'll diff the column names, where these listed here have been removed (averaged into the original sample name):
setdiff(names(my_data), names(my_data2))
# [1] "31-29TRG1R2_1" "31-25TRG2R2_1" "31-32TRG2R2_1" "31-21TRG3R2_1" "31-14TRG4R2_1"
# [6] "31-RESECTION1_2" "31-46TRG1R2_1" "31-27TRG2R2_1" "31-RESECTION1_3" "31-53TRG2R2_1"
# [11] "31-22TRG3R2_1" "31-6TRG4R2_1" "31-RESECTION2_1" "31-9TRG1R2_1" "31-26TRG3R2_1"
# [16] "31-RESECTION2_2" "31-52TRG2R1_2" "31-38TRG3R2_1" "31-49TRG4R2_1" "31-RESECTION2_3"
# [21] "31-2TRG1R2_1" "31-36TRG3R2_1" "31-RESECTION3_1" "31-42TRG2R2_1" "31-3TRG4R2_1"
# [26] "31-35TRG4R2_1" "31-30TRG5R2_1" "31-RESECTION3_2" "31-40TRG1R2_1" "31-RESECTION3_3"
# [31] "31-7TRG3R2_1" "31-43TRG3R2_1" "31-11TRG4R2_1" "31-cells_2" "31-cells_3"
# [36] "31-41TRG1R2_1" "31-RNA_2" "31-10TRG3R2_1" "31-31TRG4R2_1" "31-RNA_3"
Upvotes: 1