Reputation: 33
My data looks like this
structure(list(year = c(2017, 2017, 2017, 2017, 2018, 2018, 2018,
2018, 2019, 2019, 2019, 2019, 2017, 2017, 2018, 2018, 2019, 2019,
2017, 2017, 2017, 2017, 2018, 2018, 2018, 2018, 2019, 2019, 2019,
2019), College = c("College1", "College1", "College1", "College1",
"College1", "College1", "College1", "College1", "College1", "College1",
"College1", "College1", "College2", "College2", "College2", "College2",
"College2", "College2", "College3", "College3", "College3", "College3",
"College3", "College3", "College3", "College3", "College3", "College3",
"College3", "College3"), Department = c("Department 1", "Department 2",
"Department 3", "Department 4", "Department 1", "Department 2",
"Department 3", "Department 4", "Department 1", "Department 2",
"Department 3", "Department 4", "Department 1", "Department 2",
"Department 1", "Department 2", "Department 1", "Department 2",
"Department 1", "Department 2", "Department 3", "Department 4",
"Department 1", "Department 2", "Department 3", "Department 4",
"Department 1", "Department 2", "Department 3", "Department 4"
), Enrollment = c(51L, 322L, 251L, 106L, 468L, 205L, 718L, 200L,
344L, 256L, 434L, 38L, 487L, 503L, 14L, 448L, 489L, 437L, 695L,
833L, 941L, 299L, 864L, 888L, 531L, 335L, 47L, 753L, 319L, 986L
)), row.names = c(NA, -30L), class = c("tbl_df", "tbl", "data.frame"
))
I am having trouble calculating the enrollment change throughout the years for each College and then for each Department. So technique I want one new column that tells me at the College level what is the enrollment change, and one new column that tells me at the Department level.
I have tried many different things and this is the closest i got
growth_rate = df %>%
arrange(College, Department)%>%
group_by(College) %>%
mutate(Diff_growth = Enrollment - lag(Enrollment),
Rate_percent = Diff_growth/Enrollment)
But this code doesn't stop at the beginning of each cycle, which is the year 2017, it just keeps going down the rows. Any help would be greatly appreciated.
The desired output would look like this
structure(list(X = 1:30, year = c(2017L, 2017L, 2017L, 2017L,
2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 2019L, 2017L,
2017L, 2018L, 2018L, 2019L, 2019L, 2017L, 2017L, 2017L, 2017L,
2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 2019L), College =
c("College1",
"College1", "College1", "College1", "College1", "College1",
"College1",
"College1", "College1", "College1", "College1", "College1",
"College2",
"College2", "College2", "College2", "College2", "College2",
"College3",
"College3", "College3", "College3", "College3", "College3",
"College3",
"College3", "College3", "College3", "College3", "College3"),
Department = c("Department 1", "Department 2", "Department 3",
"Department 4", "Department 1", "Department 2", "Department 3",
"Department 4", "Department 1", "Department 2", "Department 3",
"Department 4", "Department 1", "Department 2", "Department 1",
"Department 2", "Department 1", "Department 2", "Department 1",
"Department 2", "Department 3", "Department 4", "Department 1",
"Department 2", "Department 3", "Department 4", "Department 1",
"Department 2", "Department 3", "Department 4"), Enrollment = c(51L,
322L, 251L, 106L, 468L, 205L, 718L, 200L, 344L, 256L, 434L,
38L, 487L, 503L, 14L, 448L, 489L, 437L, 695L, 833L, 941L,
299L, 864L, 888L, 531L, 335L, 47L, 753L, 319L, 986L), RateChange_College = c(NA,
NA, NA, NA, 1.179452055, 1.179452055, 1.179452055, 1.179452055,
-0.326209931, -0.326209931, -0.326209931, -0.326209931, NA,
NA, -0.533333333, -0.533333333, 1.004329004, 1.004329004,
NA, NA, NA, NA, -0.054190751, -0.054190751, -0.054190751,
-0.054190751, -0.195951108, -0.195951108, -0.195951108, -0.195951108
), RateChange_Department = c(NA, NA, NA, NA, 8.176470588,
-0.363354037, 1.860557769, 0.886792453, -0.264957265, 0.248780488,
-0.395543175, -0.81, NA, NA, -0.971252567, -0.109343936,
33.92857143, -0.024553571, NA, NA, NA, NA, 0.243165468, 0.066026411,
-0.435706695, 0.120401338, -0.945601852, -0.152027027, -0.399246704,
1.943283582)), class = "data.frame", row.names = c(NA, -30L
))
Upvotes: 0
Views: 84
Reputation: 11006
Here's a solution that should match your desired output:
library(tidyverse)
df |>
arrange(College, year) |>
mutate(Enrollment_Year = sum(Enrollment), .by = c(College, year)) |>
mutate(RateChange_College = Enrollment_Year / lag(Enrollment_Year) - 1, .by = c(College, Department)) |>
arrange(College, Department, year) |>
mutate(RateChange_Department = Enrollment / lag(Enrollment) - 1, .by = c(College, Department)) |>
arrange(College, year, Department)
# A tibble: 30 × 7
year College Department Enrollment Enrollment_Year RateChange_College RateChange_Department
<dbl> <chr> <chr> <int> <int> <dbl> <dbl>
1 2017 College1 Department 1 51 730 NA NA
2 2017 College1 Department 2 322 730 NA NA
3 2017 College1 Department 3 251 730 NA NA
4 2017 College1 Department 4 106 730 NA NA
5 2018 College1 Department 1 468 1591 1.179452 8.176471
6 2018 College1 Department 2 205 1591 1.179452 -0.3633540
7 2018 College1 Department 3 718 1591 1.179452 1.860558
8 2018 College1 Department 4 200 1591 1.179452 0.8867925
9 2019 College1 Department 1 344 1072 -0.3262099 -0.2649573
10 2019 College1 Department 2 256 1072 -0.3262099 0.2487805
# ℹ 20 more rows
# ℹ Use `print(n = ...)` to see more rows
We first calculate the Enrollment sum per year and college in order to be able to calculate the Rate Change. We then rearrange the data by College/Department/Year and do the same Rate Change calculation (however, this time directly with the original Enrollment column).
Upvotes: 1
Reputation: 3864
You can calculate the rates for colleges and college-department combinations separately, and join
the results.
library(tidyverse)
college_rates <- df |>
arrange(year, College, Department) |>
summarize(college_yearly_enrollment = sum(Enrollment),
.by = c(year, College)) |>
mutate(RateChange_College = (college_yearly_enrollment - lag(college_yearly_enrollment))/
lag(college_yearly_enrollment),
.by = College) |>
select(-college_yearly_enrollment)
glimpse(college_rates)
#> Rows: 9
#> Columns: 3
#> $ year <dbl> 2017, 2017, 2017, 2018, 2018, 2018, 2019, 2019, 2019
#> $ College <chr> "College1", "College2", "College3", "College1", "Co…
#> $ RateChange_College <dbl> NA, NA, NA, 1.17945205, -0.53333333, -0.05419075, -…
df |>
arrange(year, College, Department) |>
left_join(college_rates, by = join_by(year, College)) |>
mutate(RateChange_Department = (Enrollment - lag(Enrollment))/lag(Enrollment),
.by = c(College, Department)) |>
# filter for reprex only
filter(year > 2017)
#> # A tibble: 20 × 6
#> year College Department Enrollment RateChange_College RateChange_Department
#> <dbl> <chr> <chr> <int> <dbl> <dbl>
#> 1 2018 College1 Departmen… 468 1.18 8.18
#> 2 2018 College1 Departmen… 205 1.18 -0.363
#> 3 2018 College1 Departmen… 718 1.18 1.86
#> 4 2018 College1 Departmen… 200 1.18 0.887
#> 5 2018 College2 Departmen… 14 -0.533 -0.971
#> 6 2018 College2 Departmen… 448 -0.533 -0.109
#> 7 2018 College3 Departmen… 864 -0.0542 0.243
#> 8 2018 College3 Departmen… 888 -0.0542 0.0660
#> 9 2018 College3 Departmen… 531 -0.0542 -0.436
#> 10 2018 College3 Departmen… 335 -0.0542 0.120
#> 11 2019 College1 Departmen… 344 -0.326 -0.265
#> 12 2019 College1 Departmen… 256 -0.326 0.249
#> 13 2019 College1 Departmen… 434 -0.326 -0.396
#> 14 2019 College1 Departmen… 38 -0.326 -0.81
#> 15 2019 College2 Departmen… 489 1.00 33.9
#> 16 2019 College2 Departmen… 437 1.00 -0.0246
#> 17 2019 College3 Departmen… 47 -0.196 -0.946
#> 18 2019 College3 Departmen… 753 -0.196 -0.152
#> 19 2019 College3 Departmen… 319 -0.196 -0.399
#> 20 2019 College3 Departmen… 986 -0.196 1.94
Upvotes: 2