Reputation: 133
I have a dataset where I have the primary ID and wave along with a flag for each type (within a wave). It looks something like this.
| id | wave | TypeA | TypeB | TypeC |
|----|------|-------|-------|-------|
| 1 | 1 | 1 | 0 | 0 |
| 2 | 1 | 0 | 0 | 1 |
| 3 | 1 | 0 | 1 | 0 |
| 4 | 1 | 1 | 0 | 0 |
| 5 | 1 | 0 | 1 | 0 |
| 1 | 2 | 1 | 0 | 0 |
| 2 | 2 | 1 | 0 | 0 |
| 3 | 2 | 0 | 0 | 1 |
| 4 | 2 | 1 | 0 | 0 |
| 5 | 2 | 0 | 1 | 0 |
| 6 | 2 | 0 | 0 | 1 |
I would like to create a new column for each type that holds the proportion of each type of individual in a wave. Each wave doesn't have the same number of individuals. The output should look like this:
| id | wave | TypeA | TypeB | TypeC | TypeA_prop | TypeB_prop | TypeC_prop |
|----|------|-------|-------|-------|------------|------------|------------|
| 1 | 1 | 1 | 0 | 0 | 0.4 | 0.4 | 0.1 |
| 2 | 1 | 0 | 0 | 1 | 0.4 | 0.4 | 0.1 |
| 3 | 1 | 0 | 1 | 0 | 0.4 | 0.4 | 0.1 |
| 4 | 1 | 1 | 0 | 0 | 0.4 | 0.4 | 0.1 |
| 5 | 1 | 0 | 1 | 0 | 0.4 | 0.4 | 0.1 |
| 1 | 2 | 1 | 0 | 0 | 0.5 | 0.17 | 0.33 |
| 2 | 2 | 1 | 0 | 0 | 0.5 | 0.17 | 0.33 |
| 3 | 2 | 0 | 0 | 1 | 0.5 | 0.17 | 0.33 |
| 4 | 2 | 1 | 0 | 0 | 0.5 | 0.17 | 0.33 |
| 5 | 2 | 0 | 1 | 0 | 0.5 | 0.17 | 0.33 |
| 6 | 2 | 0 | 0 | 1 | 0.5 | 0.17 | 0.33 |
I don't usually use R with panel data so I'm new to this. How can I go about this (preferably with dplyr)? TIA
Upvotes: 0
Views: 45
Reputation: 8880
using data.table
df <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
6L), wave = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), TypeA = c(1L,
0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L), TypeB = c(0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), TypeC = c(0L, 1L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L)), row.names = c(NA, -11L), class = "data.frame")
library(data.table)
cols_select <- grep("^Type", names(df), value = T)
nm_prop <- paste0(cols_select, "_prop")
setDT(df)
out <- df[, (nm_prop) := lapply(.SD, mean, na.rm = T), by = wave, .SDcols = cols_select]
head(out)
#> id wave TypeA TypeB TypeC TypeA_prop TypeB_prop TypeC_prop
#> 1: 1 1 1 0 0 0.4 0.4000000 0.2000000
#> 2: 2 1 0 0 1 0.4 0.4000000 0.2000000
#> 3: 3 1 0 1 0 0.4 0.4000000 0.2000000
#> 4: 4 1 1 0 0 0.4 0.4000000 0.2000000
#> 5: 5 1 0 1 0 0.4 0.4000000 0.2000000
#> 6: 1 2 1 0 0 0.5 0.1666667 0.3333333
Created on 2021-03-03 by the reprex package (v1.0.0)
Upvotes: 0
Reputation: 11594
Does this work:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -c(id, wave)) %>%
group_by(wave) %>%
mutate(tot = sum(value)) %>%
group_by(wave, name) %>% mutate(type_tot = sum(value)) %>%
mutate(prop = type_tot/tot) %>%
select(id, wave, name, value,prop) %>%
pivot_wider(id_cols = c(id,wave), names_from = name, values_from = c(value,prop))
# A tibble: 11 x 8
# Groups: wave [2]
id wave value_TypeA value_TypeB value_TypeC prop_TypeA prop_TypeB prop_TypeC
<int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
1 1 1 1 0 0 0.4 0.4 0.2
2 2 1 0 0 1 0.4 0.4 0.2
3 3 1 0 1 0 0.4 0.4 0.2
4 4 1 1 0 0 0.4 0.4 0.2
5 5 1 0 1 0 0.4 0.4 0.2
6 1 2 1 0 0 0.5 0.167 0.333
7 2 2 1 0 0 0.5 0.167 0.333
8 3 2 0 0 1 0.5 0.167 0.333
9 4 2 1 0 0 0.5 0.167 0.333
10 5 2 0 1 0 0.5 0.167 0.333
11 6 2 0 0 1 0.5 0.167 0.333
Data used:
df
id wave TypeA TypeB TypeC
1 1 1 1 0 0
2 2 1 0 0 1
3 3 1 0 1 0
4 4 1 1 0 0
5 5 1 0 1 0
6 1 2 1 0 0
7 2 2 1 0 0
8 3 2 0 0 1
9 4 2 1 0 0
10 5 2 0 1 0
11 6 2 0 0 1
Upvotes: 0
Reputation: 389145
If you have binary (1/0) values in Type
column you can take mean
of columns for each wave
.
library(dplyr)
df %>%
group_by(wave) %>%
mutate(across(starts_with('Type'), mean, .names = '{col}_prop')) %>%
ungroup
# id wave TypeA TypeB TypeC TypeA_prop TypeB_prop TypeC_prop
# <int> <int> <int> <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 1 0 0 0.4 0.4 0.2
# 2 2 1 0 0 1 0.4 0.4 0.2
# 3 3 1 0 1 0 0.4 0.4 0.2
# 4 4 1 1 0 0 0.4 0.4 0.2
# 5 5 1 0 1 0 0.4 0.4 0.2
# 6 1 2 1 0 0 0.5 0.167 0.333
# 7 2 2 1 0 0 0.5 0.167 0.333
# 8 3 2 0 0 1 0.5 0.167 0.333
# 9 4 2 1 0 0 0.5 0.167 0.333
#10 5 2 0 1 0 0.5 0.167 0.333
#11 6 2 0 0 1 0.5 0.167 0.333
data
df <- structure(list(id = c(1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L,
6L), wave = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L), TypeA = c(1L,
0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L), TypeB = c(0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L), TypeC = c(0L, 1L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 1L)), row.names = c(NA, -11L), class = "data.frame")
Upvotes: 2