Reputation: 343
I have a dataframe for gene expression data. Samples are named as Genotype_Time_Replicate (e.g. AOX_1h_4).
Example data set:
# Example expression table in wide format: 23 rows, one per ID.
# Columns: ID, Location, then 24 sample columns named Genotype_Time_Replicate
# (genotypes AOX and WT; times 1h, 2h, 6h; replicates 1-4).
df <- structure(list(ID = c("AT5G54740.1", "AT5G55730.2", "AT5G57655.2", "AT5G64100.1", "AT5G64260.1", "AT5G67360.1", "AT1G30630.1", "AT1G62380.1", "AT1G70830.1", "AT3G14990.1", "AT4G18800.1", "AT4G24510.1", "AT5G15650.1", "AT5G19820.1", "AT5G59840.1", "AT5G47200.1", "AT1G12840.1", "AT1G76030.1", "AT1G78900.2", "AT3G42050.1", "AT4G11150.1", "AT1G11860.2", "AT1G17290.1" ),
Location = c("extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "extracellular", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "golgi", "ER", "ER", "ER", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion", "mitochondrion"),
AOX_1h_1 = c(0.844651873, 0.50954096, 1.12e-08, 0.012981372, 0.978148381, 0.027579578, 0.068010151, 0.410629215, 0.253838635, 0.033631788, 0.335713512, 0.982799013, 0.025910457, 0.793810264, 0.762431665, 0.152154436, 0.027114103, 0.000227, 1.07e-05, 0.721209032, 0.086281162, 0.483130711, 0.014795515),
AOX_1h_2 = c(0.894623378, 0.011521413, 1.62e-06, 0.085249729, 0.02863972, 0.956962154, 0.225208718, 0.932679767, 0.002574192, 0.071700671, 0.233682544, 0.936572874, 1.12e-05, 0.241658735, 0.865205515, 0.000537, 0.103471292, 8.66e-07, 1.22e-08, 0.950878446, 0.145012176, 0.092919172, 0.599713247),
AOX_1h_3 = c(0.880951025, 0.00145276, 8.59e-10, 0.087023475, 0.675527672, 0.765543306, 0.305860948, 0.899172011, 0.020973476, 0.542988545, 0.735571562, 0.157569324, 0.025488075, 0.071006507, 0.262324019, 0.080470612, 0.0436526, 6.65e-09, 5.63e-10, 0.020557091, 0.069577215, 0.005502212, 0.852099232),
AOX_1h_4 = c(0.980823252, 0.158123518, 0.00210702, 0.006317657, 0.30496173, 0.489709702, 0.091469807, 0.958443361, 0.015583593, 0.566165972, 0.66746161, 0.935102341, 0.087733288, 0.744313619, 0.021169383, 0.633250945, 0.257489406, 0.024345088, 0.000355, 0.226279179, 0.004038493, 0.479275204, 0.703522761),
AOX_2h_1 = c(0.006474022, 0.246530998, 5.38e-06, 0.47169153, 0.305973663, 0.466202566, 0.191733645, 0.016121487, 0.234839116, 0.043866023, 0.089819656, 0.107934599, 2.09e-06, 0.413229678, 0.464078018, 0.004118766, 0.774970986, 3.79e-07, 2.3e-10, 0.428591262, 0.002326292, 0.385580707, 0.106216066),
AOX_2h_2 = c(0.166169729, 0.005721199, 7.77e-08, 0.099146712, 0.457164663, 0.481987525, 7.4e-05, 0.969805081, 0.100894997, 0.062103337, 0.095718425, 0.001686206, 0.009710516, 0.134651787, 0.887036569, 0.459218152, 0.074576369, 3.88e-09, 3.31e-15, 0.409645805, 0.064874307, 0.346371524, 0.449444779),
AOX_2h_3 = c(1.06e-05, 0.576589898, 4.03e-08, 0.787468189, 0.971119601, 0.432593753, 0.000274, 0.86932399, 0.08657663, 4.22e-06, 0.071190008, 0.697384316, 0.161623604, 0.422628778, 0.299545652, 0.767867006, 0.00295567, 0.078724176, 4.33e-09, 0.988576028, 0.080278831, 0.66505527, 0.014158693),
AOX_2h_4 = c(0.010356719, 0.026506539, 9.48e-09, 0.91009296, 0.302464488, 0.894377768, 0.742233323, 0.75032613, 0.175841127, 0.000721, 0.356904918, 0.461234653, 1.08e-05, 0.65800831, 0.360085919, 0.004814238, 0.174670947, 0.004246734, 7.31e-11, 0.778725214, 0.051334623, 0.10212841, 0.155831664 ),
AOX_6h_1 = c(0.271681878, 0.004822226, 1.87e-11, 0.616969208, 0.158860224, 0.684690326, 0.011798791, 0.564591916, 0.000314, 4.79e-06, 0.299871385, 0.001909713, 0.00682428, 0.039107415, 0.574143284, 0.061532691, 0.050483892, 2.28e-08, 1.92e-12, 0.058747794, 0.027147473, 0.196608218, 0.513693112),
AOX_6h_2 = c(5.72e-12, 0.719814288, 0.140016259, 0.927094438, 0.841229414, 0.224510089, 0.026567282, 0.242981965, 0.459311076, 0.038295888, 0.127935565, 0.453746728, 0.005023732, 0.554532387, 0.280899096, 0.336458018, 0.002024021, 0.793915731, 0.012838565, 0.873716549, 0.10097853, 0.237426815, 0.003711539),
AOX_6h_3 = c(3.16e-12, 0.780424491, 0.031315419, 0.363891436, 0.09562579, 0.104833988, 3.52e-05, 0.104196756, 0.870952423, 0.002036134, 0.016480622, 0.671475063, 2.3e-05, 0.00256744, 0.66263641, 0.005026601, 0.57280276, 0.058724117, 6.4e-10, 0.030965264, 0.005301006, 0.622027012, 0.371659724),
AOX_6h_4 = c(7.99e-10, 0.290847169, 0.001319424, 0.347344795, 0.743846306, 0.470908425, 0.00033, 0.016149973, 0.080036584, 0.020899676, 0.00723071, 0.187288769, 0.042514886, 0.00150443, 0.059344154, 0.06554177, 0.112601764, 0.000379, 2.36e-10, 0.78131093, 0.105861995, 0.174370801, 0.05570041 ),
WT_1h_1 = c(0.857, 0.809, 2.31e-05, 0.286, 0.87, 0.396, 0.539, 0.787, 0.73, 0.427, 0.764, 0.87, 0.386, 0.852, 0.848, 0.661, 0.393, 0.0415, 0.00611, 0.843, 0.576, 0.804, 0.304 ),
WT_1h_2 = c(0.898, 0.509, 0.0192, 0.729, 0.616, 0.902, 0.811, 0.9, 0.343, 0.712, 0.814, 0.901, 0.0446, 0.816, 0.896, 0.217, 0.747, 0.0143, 0.000964, 0.901, 0.776, 0.737, 0.876 ),
WT_1h_3 = c(0.939, 0.627, 0.0104, 0.867, 0.932, 0.935, 0.91, 0.939, 0.803, 0.926, 0.934, 0.888, 0.813, 0.859, 0.905, 0.864, 0.838, 0.0223, 0.00917, 0.802, 0.858, 0.724, 0.938 ),
WT_1h_4 = c(0.911, 0.782, 0.298, 0.396, 0.837, 0.871, 0.727, 0.91, 0.506, 0.88, 0.89, 0.909, 0.723, 0.896, 0.547, 0.887, 0.824, 0.566, 0.175, 0.814, 0.348, 0.869, 0.893),
WT_2h_1 = c(0.748, 0.911, 0.231, 0.929, 0.917, 0.928, 0.903, 0.801, 0.909, 0.849, 0.878, 0.884, 0.183, 0.925, 0.928, 0.719, 0.941, 0.108, 0.00817, 0.926, 0.678, 0.923, 0.884),
WT_2h_2 = c(0.935, 0.851, 0.163, 0.925, 0.951, 0.952, 0.63, 0.963, 0.926, 0.916, 0.925, 0.804, 0.868, 0.931, 0.961, 0.951, 0.92, 0.0706, 0.000265, 0.95, 0.917, 0.947, 0.951),
WT_2h_3 = c(0.0197, 0.894, 0.000613, 0.911, 0.922, 0.877, 0.122, 0.916, 0.739, 0.0125, 0.718, 0.905, 0.801, 0.875, 0.852, 0.91, 0.302, 0.729, 0.00015, 0.923, 0.731, 0.902, 0.504),
WT_2h_4 = c(0.696, 0.765, 0.0142, 0.931, 0.893, 0.931, 0.925, 0.925, 0.87, 0.45, 0.899, 0.908, 0.144, 0.921, 0.899, 0.631, 0.87, 0.62, 0.0014, 0.926, 0.807, 0.844, 0.865),
WT_6h_1 = c(0.898, 0.727, 0.00395, 0.921, 0.881, 0.924, 0.776, 0.919, 0.542, 0.234, 0.901, 0.67, 0.747, 0.83, 0.919, 0.848, 0.841, 0.056, 0.00144, 0.846, 0.815, 0.888, 0.916),
WT_6h_2 = c(2.38e-09, 0.88, 0.708, 0.898, 0.891, 0.768, 0.443, 0.777, 0.843, 0.505, 0.695, 0.842, 0.208, 0.859, 0.794, 0.813, 0.14, 0.887, 0.326, 0.894, 0.661, 0.775, 0.182),
# NOTE: WT_6h_3 is stored as an integer vector (all 1L), unlike the other
# sample columns, which are doubles.
WT_6h_3 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
WT_6h_4 = c(0.0357, 0.953, 0.792, 0.956, 0.967, 0.96, 0.711, 0.892, 0.931, 0.899, 0.866, 0.946, 0.917, 0.799, 0.925, 0.927, 0.938, 0.72, 0.025, 0.967, 0.936, 0.945, 0.923)),
# Plain data.frame with automatic row names for the 23 rows.
class = "data.frame", row.names = c(NA, -23L))
I want to summarize the data for each organelle (averaged over the organelle's genes and the samples' replicates) and plot the wild-type and mutant data side by side, with standard error, for each time point.
# Reshape to long format: ID and Location stay as identifier columns, the
# sample names go to `variable` and the measurements to `value`.
# The dangling `df <-` that preceded this line was dropped: it continued onto
# the melt() call (`df <- melted <- melt(df)`) and overwrote the wide table.
# NOTE(review): melt() is presumably reshape2::melt -- confirm which package
# is attached.
melted <- melt(df, id.vars = c("ID", "Location"))
head(melted)

# Strip the trailing replicate number so that all replicates of a sample share
# one Genotype_Time label.
melted$variable <- str_replace_all(melted$variable, "_[0-9]$", "")

# Fix the level order so WT and AOX sit next to each other at each time point.
melted$variable <- factor(
  melted$variable,
  levels = c("WT_1h", "AOX_1h", "WT_2h", "AOX_2h", "WT_6h", "AOX_6h")
)

# WT-vs-AOX pairs to compare, one per time point.
my_comparisons <- list(
  c("WT_1h", "AOX_1h"),
  c("WT_2h", "AOX_2h"),
  c("WT_6h", "AOX_6h")
)

# Bar plot with the "mean_se" summary added, one facet per Location, plus
# significance labels for the pairs listed above.
ggbarplot(melted, x = "variable", y = "value", add = "mean_se",
          color = "variable",
          palette = c("grey", "black", "grey", "black", "grey", "black"),
          facet.by = "Location") +
  stat_compare_means(comparisons = my_comparisons, label = "p.signif")
How can I use the tidyverse (`dplyr` / `tidyr`) for this purpose, i.e. to follow this workflow instead of the scripts above?
Upvotes: 2
Views: 110
Reputation: 347
Another version, starting from the `df` object:
The `df` object is a list, and the expression values are of character type after `cbind`, so you can do
# Bind the columns of df into one (character) matrix and turn it into a tibble,
# then convert every column except ID and Location back to numeric.
# Selecting by name instead of the fixed positions 3:14 means all sample
# columns are converted, however many samples df contains.
tb <- as_tibble(do.call(cbind, df)) %>%
  mutate_at(vars(-ID, -Location), as.numeric)
NB: for gene expression data it is usually easier to read in the count data using `read_tsv` or `read.table` and combine it into a `matrix`, `data.frame` or `tibble`.
NBB: the `df` object specified has no "WT" samples (from my copy/paste anyway), so I renamed the last 4 samples in `tb` as "WT_1h" replicates:
# Relabel columns 11 to 14 of tb as WT_1h replicates 1 to 4.
wt_labels <- paste0("WT_1h_", seq_len(4))
colnames(tb)[11:14] <- wt_labels
Create means from replicates by function
# Append a column `<nm>_mean` holding the row-wise mean of the replicate
# columns whose names match `nm`.
#
# tb: data frame or tibble with one numeric column per replicate.
# nm: pattern passed to grep() against the column names, typically the
#     "Genotype_Time" prefix such as "AOX_1h".
# Returns `tb` with the additional `<nm>_mean` column.
# Stops with an error when no column matches `nm`.
rowMeanNrep <- function(tb, nm){
  varname <- paste0(nm, "_mean")
  # Exclude an already existing `<nm>_mean` column so that re-running the
  # function recomputes the mean from the replicates only.
  selectn <- setdiff(grep(nm, colnames(tb), value = TRUE), varname)
  if (length(selectn) == 0) {
    # Without this guard rowMeans() over zero columns silently returns NaN.
    stop("no columns match '", nm, "'", call. = FALSE)
  }
  dplyr::mutate(tb, !!varname := rowMeans(tb[, selectn, drop = FALSE]))
}
Specify which timepoints to use, and apply
# Genotype_Time groups whose replicates are averaged.
tps <- c("AOX_1h", "WT_1h")

# For each group, compute the replicate mean and keep only the new
# `<group>_mean` column.
mean_cols <- lapply(tps, function(f){
  rowMeanNrep(tb = tb, nm = f) %>%
    dplyr::select(paste0(f, "_mean"))
})

# Put the first two columns of tb in front of the per-group means.
# (The original referenced `tb_1h`, which is never defined; `tb` is the table
# built above.)
tb_1h_mean <- cbind(tb[, 1:2], do.call(cbind, mean_cols))
A final NB, think about using boxplots instead of barplots, see this paper
Upvotes: 2
Reputation: 6020
You can use different functions to normalise this data. In this example I use `gather()` alongside `stringr` functions to split up the sample-name character vector, which holds 3 columns of data (genotype, time and replicate).
# Long format -> split each sample name on "_" into type / hour / replicate
# number -> mean value per type and hour.
dat %>%
  gather(key, value, -ID, -Location) %>%
  mutate(
    parts = str_split(key, "_"),        # split every sample name once
    type = map_chr(parts, ~ .x[1]),
    hour = map_chr(parts, ~ .x[2]),
    n = map_chr(parts, ~ .x[3])
  ) %>%
  group_by(type, hour) %>%
  summarise(mean = mean(value))
Gives
# A tibble: 6 x 3
# Groups: type [?]
type hour mean
<chr> <chr> <dbl>
1 AOX 1h 0.3235302
2 AOX 2h 0.2709910
3 AOX 6h 0.2226648
4 WT 1h 0.6633866
5 WT 2h 0.7263108
6 WT 6h 0.7915662
You can use this in `ggplot()` to make a nice barplot.
To get it in a table you can use
# Same summary as a wide table: one row per hour, one column per type.
dat %>%
  gather(key, value, -ID, -Location) %>%
  mutate(
    parts = str_split(key, "_"),        # split every sample name once
    type = map_chr(parts, ~ .x[1]),
    hour = map_chr(parts, ~ .x[2]),
    n = map_chr(parts, ~ .x[3])
  ) %>%
  group_by(type, hour) %>%
  summarise(mean = mean(value)) %>%
  spread(type, mean)
to get
# A tibble: 3 x 3
hour AOX WT
* <chr> <dbl> <dbl>
1 1h 0.3235302 0.6633866
2 2h 0.2709910 0.7263108
3 6h 0.2226648 0.7915662
Upvotes: 3