stats_noob
stats_noob

Reputation: 5907

Creating a New Variable for Each Unique Group

I am working in R.

I have the following 5 data sets (data_1, data_2, data_3, data_4, data_5):

v1 <- c("2010-2011","2011-2012", "2012-2013", "2013-2014", "2014-2015") 
v2 <- c("A", "B", "C", "D", "E")

data_1 = data.frame(var_1 = rnorm(871, 10,10), var_2 = rnorm(871, 5,5))

data_1$dates <- as.factor(sample(v1, 871, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

data_1$types <- as.factor(sample(v2, 871, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))


data_2 = data.frame(var_1 = rnorm(412, 10,10), var_2 = rnorm(412, 5,5))

data_2$dates <- as.factor(sample(v1, 412, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

data_2$types <- as.factor(sample(v2, 412, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))


data_3 = data.frame(var_1 = rnorm(332, 10,10), var_2 = rnorm(332, 5,5))

data_3$dates <- as.factor(sample(v1, 332, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

data_3$types <- as.factor(sample(v2, 332, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))


data_4 = data.frame(var_1 = rnorm(611, 10,10), var_2 = rnorm(611, 5,5))

data_4$dates <- as.factor(sample(v1, 611, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

data_4$types <- as.factor(sample(v2, 611, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))


data_5 = data.frame(var_1 = rnorm(789, 10,10), var_2 = rnorm(789, 5,5))

data_5$dates <- as.factor(sample(v1, 789, replace=TRUE, prob=c(0.5, 0.2, 0.1, 0.1, 0.1)))

data_5$types <- as.factor(sample(v2, 789, replace=TRUE, prob=c(0.3, 0.2, 0.1, 0.1, 0.1)))

# sample of one of the files

head(data_1)
      var_1     var_2     dates types
1  8.523382  4.945344 2010-2011     E
2 14.137515  3.223525 2012-2013     A
3 19.610770  7.762698 2011-2012     D
4 11.334196 10.879946 2012-2013     E
5 -1.406475  2.498347 2011-2012     E
6 11.116458  9.988073 2011-2012     E

Based on the above data, I made a table that provides summaries:

summary_table = data.frame(names = c("data_1", "data_2", "data_3", "data_4", "data_5"),
 counts = c(nrow(data_1), nrow(data_2), nrow(data_3), nrow(data_4), nrow(data_5) ),
mean_var_1 = c(mean(data_1$var_1), mean(data_2$var_1), mean(data_3$var_1), mean(data_4$var_1), mean(data_5$var_1)),
mean_var_2 = c(mean(data_2$var_1), mean(data_2$var_2), mean(data_3$var_2), mean(data_4$var_2), mean(data_5$var_2))

)


   names counts mean_var_1 mean_var_2
1 data_1    871   9.426475   9.853399
2 data_2    412   9.853399   4.680188
3 data_3    332  10.275049   5.256084
4 data_4    611  10.094421   5.323108
5 data_5    789   9.960050   4.946458

I would like to add 5 new columns to the above tables that contain the counts per year . This would look something like this (this an empty template):

df <- data.frame(matrix(ncol = 7, nrow = 0))
x <- c("names", "counts", "counts 2010-2011", "counts 2011-2012", "counts 2012-2013", "counts 2013-2014", "counts 2014-2015")
colnames(df) <- x

I know how to do this manually, but it would take a long time:

    library(dplyr)
    
     summary_1 = data.frame( data_1 %>%  group_by(dates) %>% summarise(my_counts = n()) )
     summary_2 = data.frame( data_2 %>%  group_by(dates) %>% summarise(my_counts = n()) )
     summary_3 = data.frame( data_3 %>%  group_by(dates) %>% summarise(my_counts = n()) )
     summary_4 = data.frame( data_4 %>%  group_by(dates) %>% summarise(my_counts = n()) )
     summary_5 = data.frame( data_5 %>%  group_by(dates) %>% summarise(my_counts = n()) )

#view sample of output
summary_1

      dates my_counts
1 2010-2011       407
2 2011-2012       189
3 2012-2013        79
4 2013-2014       101
5 2014-2015        95

But I would then have to manually create 5 new columns and manually copy these 25 counts ( 5 x 5 = 25).

Can someone please show me a faster way to do this?

Thanks!

Example of Desired Output

enter image description here

Upvotes: 1

Views: 189

Answers (1)

Lennyy
Lennyy

Reputation: 6132

Perhaps you could consider to use set.seed() next time when using random values, so someone who is answering might use the exact same sampled data. Also, you could consider to use _ instead - signs when defining V1 to avoid the backticks in the resulting dataframe, which are added since R does not like minus-signs in column names.

You could use a named list to have values like data_1, data_2 etc in the data_nr column instead of 1, 2 etc.

library(tidyverse)

list(data_1, data_2, data_3, data_4, data_5) %>% 
  set_names(paste0("data_", 1:length(.))) %>% 
  bind_rows(.id = "data_nr") %>% 
  count(data_nr, dates, name = "my_counts") %>% 
  pivot_wider(names_from = dates, values_from = my_counts, names_prefix = "counts_")

       data_nr `counts_2010-2011` `counts_2011-2012` `counts_2012-2013` `counts_2013-2014` `counts_2014-2015`
  <chr>                <int>              <int>              <int>              <int>              <int>
1 data_1                 437                161                 93                 88                 92
2 data_2                 218                 68                 40                 36                 50
3 data_3                 170                 58                 35                 34                 35
4 data_4                 331                114                 65                 54                 47
5 data_5                 398                146                 89                 78                 78

Upvotes: 2

Related Questions