Reputation: 5907
I am working in R.
I have the following 5 data sets (data_1, data_2, data_3, data_4, data_5):
v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015")
v2 <- c("A", "B", "C", "D", "E")
data_1 = data.frame(var_1 = rnorm(871, 10,10), var_2 = rnorm(871, 5,5))
data_1$dates <- as.factor(sample(v1, 871, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
data_1$types <- as.factor(sample(v2, 871, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
data_2 = data.frame(var_1 = rnorm(412, 10,10), var_2 = rnorm(412, 5,5))
data_2$dates <- as.factor(sample(v1, 412, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
data_2$types <- as.factor(sample(v2, 412, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
data_3 = data.frame(var_1 = rnorm(332, 10,10), var_2 = rnorm(332, 5,5))
data_3$dates <- as.factor(sample(v1, 332, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
data_3$types <- as.factor(sample(v2, 332, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
data_4 = data.frame(var_1 = rnorm(611, 10,10), var_2 = rnorm(611, 5,5))
data_4$dates <- as.factor(sample(v1, 611, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
data_4$types <- as.factor(sample(v2, 611, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
data_5 = data.frame(var_1 = rnorm(789, 10,10), var_2 = rnorm(789, 5,5))
data_5$dates <- as.factor(sample(v1, 789, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))
data_5$types <- as.factor(sample(v2, 789, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))
# sample of one of the files
head(data_1)
var_1 var_2 dates types
1 8.523382 4.945344 2010-2011 E
2 14.137515 3.223525 2012-2013 A
3 19.610770 7.762698 2011-2012 D
4 11.334196 10.879946 2012-2013 E
5 -1.406475 2.498347 2011-2012 E
6 11.116458 9.988073 2011-2012 E
Based on the above data, I made a table that provides summaries:
summary_table = data.frame(names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
counts = c(nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5) ),
mean_var_1 = c(mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1), mean(data_4$var_1), mean(data_5$var_1)),
mean_var_2 = c(mean(data_2$var_1), mean(data_2$var_2), mean(data_3$var_2), mean(data_4$var_2), mean(data_5$var_2))
)
names counts mean_var_1 mean_var_2
1 data_1 871 9.426475 9.853399
2 data_2 412 9.853399 4.680188
3 data_3 332 10.275049 5.256084
4 data_4 611 10.094421 5.323108
5 data_5 789 9.960050 4.946458
I would like to add 5 new columns to the above tables that contain the counts per year . This would look something like this (this an empty template):
df <- data.frame(matrix(ncol = 7, nrow = 0))
x <- c("names", "counts", "counts 2010-2011", "counts 2011-2012", "counts 2012-2013", "counts 2013-2014", "counts 2014-2015")
colnames(df) <- x
I know how to do this manually, but it would take a long time:
library(dplyr)
summary_1 = data.frame( data_1 %>% group_by(dates) %>% summarise(my_counts = n()) )
summary_2 = data.frame( data_2 %>% group_by(dates) %>% summarise(my_counts = n()) )
summary_3 = data.frame( data_3 %>% group_by(dates) %>% summarise(my_counts = n()) )
summary_4 = data.frame( data_4 %>% group_by(dates) %>% summarise(my_counts = n()) )
summary_5 = data.frame( data_5 %>% group_by(dates) %>% summarise(my_counts = n()) )
#view sample of output
summary_1
dates my_counts
1 2010-2011 407
2 2011-2012 189
3 2012-2013 79
4 2013-2014 101
5 2014-2015 95
But I would then have to manually create 5 new columns and manually copy these 25 counts ( 5 x 5 = 25).
Can someone please show me a faster way to do this?
Thanks!
Example of Desired Output
Upvotes: 1
Views: 189
Reputation: 6132
Perhaps you could consider to use set.seed()
next time when using random values, so someone who is answering might use the exact same sampled data. Also, you could consider to use _
instead -
signs when defining V1 to avoid the backticks in the resulting dataframe, which are added since R does not like minus-signs in column names.
You could use a named list to have values like data_1, data_2 etc in the data_nr column instead of 1, 2 etc.
library(tidyverse)
list(data_1, data_2, data_3, data_4, data_5) %>%
set_names(paste0("data_", 1:length(.))) %>%
bind_rows(.id = "data_nr") %>%
count(data_nr, dates, name = "my_counts") %>%
pivot_wider(names_from = dates, values_from = my_counts, names_prefix = "counts_")
data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
<chr> <int> <int> <int> <int> <int>
1 data_1 437 161 93 88 92
2 data_2 218 68 40 36 50
3 data_3 170 58 35 34 35
4 data_4 331 114 65 54 47
5 data_5 398 146 89 78 78
Upvotes: 2