Reputation: 702
I have a dataframe as follows:
library("dplyr")
# Example input: one row per group, with `n_success` and `n` columns
# (presumably the number of successes out of `n` trials for that group).
df <- data.frame(
  name = c("group1", "group2"),
  n_success = c(32, 30),
  n = c(122, 123),
  stringsAsFactors = FALSE
)
For each group, I take 1000 samples from a beta distribution:
# Draw 1000 samples per group from Beta(1 + n_success, 1 + n - n_success).
# rowwise() makes mutate() evaluate rbeta() once per row; without it rbeta()
# is called a single time on the whole columns (recycling the parameter sets
# across the 1000 draws) and that one vector is shared by every row.
df <- df %>%
  rowwise() %>%
  mutate(sims = list(rbeta(1000, 1 + n_success, 1 + n - n_success))) %>%
  ungroup() %>%
  select(name, sims)
# str(df)
# prints out (abridged):
# name: chr "group1" "group2"
# sims: List of 2  (one numeric vector of 1000 draws per group)
I now have a dataframe where each row consists of a string and of a list.
How do I go from this to a dataframe where the column names are "group1" and "group2", and each column holds the 1000 simulated values for that group? Note that the number of groups is arbitrary: if I had 12 groups, I would like 12 columns.
Upvotes: 1
Views: 2096
Reputation: 83215
Using:
library(dplyr)
library(tidyr)
# Simulate 1000 Beta draws per group and reshape to one column per group.
df %>%
  # rowwise() makes rbeta() run once per row, so each group gets its own
  # draws; without it a single vector is shared by every row and all group
  # columns come out identical.
  rowwise() %>%
  mutate(sims = list(rbeta(1000, 1 + n_success, 1 + n - n_success))) %>%
  ungroup() %>%
  select(name, sims) %>%
  unnest(cols = sims) %>%
  # Number the draws within each group so they can be aligned side by side.
  group_by(name) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  pivot_wider(names_from = name, values_from = sims) %>%
  # Drop only the helper index so this works for any number of groups.
  select(-rn)
you get:
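      group1    group2
*      <dbl>     <dbl>
 1 0.2448308 0.2448308
 2 0.2580710 0.2580710
 3 0.2249618 0.2249618
 4 0.2652175 0.2652175
 5 0.3002762 0.3002762
 6 0.1852094 0.1852094
 7 0.2706153 0.2706153
 8 0.2580558 0.2580558
 9 0.2264272 0.2264272
10 0.3198264 0.3198264
# ... with 990 more rows
Note: in this printout the two columns are identical because it came from a run in which rbeta() was called once for the whole data frame, so both rows shared the same vector of draws. When the draws are generated separately for each row (for example after rowwise()), the columns differ.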
The data.table package might be better suited for the required transformation. Using:
library(data.table)
# Long format: 1000 Beta draws per group. `by = name` evaluates rbeta()
# separately for each group. setDT() converts `df` to a data.table by reference.
sims_long <- setDT(df)[
  ,
  .(sims = rbeta(1000, 1 + n_success, 1 + n - n_success)),
  by = name
]

# Wide format: one column per group, with draws aligned by their position
# within the group via rowid(name).
sims_wide <- dcast(sims_long, rowid(name) ~ name, value.var = "sims")

# Remove the `name` column left over from the left-hand side of the formula,
# then print the result.
sims_wide[, name := NULL][]
you get:
         group1    group2
   1: 0.2882302 0.3061312
   2: 0.2615165 0.2763967
   3: 0.2885236 0.2516134
   4: 0.2516337 0.2455496
   5: 0.2635944 0.2267952
  ---
 996: 0.2658737 0.2525680
 997: 0.3045952 0.2193125
 998: 0.2505284 0.1967361
 999: 0.2723949 0.2389607
1000: 0.2544297 0.2477589
An alternative in base R:
# Draw 1000 values from Beta(1 + n_success, 1 + n - n_success) for one group.
# `x` must support `[[` lookup of "n_success" and "n" (e.g. a one-row slice
# of `df`, or a named list).
f <- function(x) {
  successes <- x[["n_success"]]
  rbeta(1000, 1 + successes, 1 + x[["n"]] - successes)
}
# Split `df` into a named list of per-group data frames, simulate the draws
# for each one with f(), then bind the named vectors side by side so that
# every group becomes a column.
lst_1 <- split(df, df[["name"]])
lst_2 <- lapply(X = lst_1, FUN = f)
do.call(what = cbind.data.frame, args = lst_2)
Upvotes: 2
Reputation: 10162
You can also stick to dplyr and the tidyverse. I would do it like so:
library(dplyr)
library(tidyr) # for unnest() and spread()
# Input data from the question: one row per group with its `n_success` and
# `n` values.
df <- data.frame(
  name = c("group1", "group2"),
  n_success = c(32, 30),
  n = c(122, 123),
  stringsAsFactors = FALSE
)
# Continuing your approach (be aware that I added a list() and closed a
# missing parenthesis).
# rowwise() makes mutate() evaluate rbeta() once per row, so every group gets
# its own 1000 draws from its own Beta distribution. Without it, rbeta() is
# called a single time on the whole columns (recycling the parameter sets
# across the 1000 draws) and that one vector is shared by every row, which
# makes all group columns identical.
df2 <- df %>%
  rowwise() %>%
  mutate(sims = list(rbeta(1000, 1 + n_success, 1 + n - n_success))) %>%
  ungroup() %>%
  select(name, sims)
str(df2)
#> (abridged; the draws are random, so the numbers differ on every run)
#> $ name: chr "group1" "group2"
#> $ sims:List of 2
#>  ..$ : num [1:1000] ...
#>  ..$ : num [1:1000] ...

# Unnest the list column into long format and number the draws within each
# group; ungroup() afterwards so the grouping does not leak into later steps.
df3 <- df2 %>%
  unnest(cols = sims) %>%
  group_by(name) %>%
  mutate(num = row_number()) %>%
  ungroup()
df3
#> A tibble with 2,000 rows and 3 columns:
#>   name <chr>, sims <dbl>, num <int>
#> Rows 1-1000 hold group1 (num = 1..1000), rows 1001-2000 hold group2.

# Reshape to wide format again: one column per group, one row per draw.
df_final <- df3 %>% pivot_wider(names_from = name, values_from = sims)
df_final
#> A tibble with 1,000 rows and 3 columns:
#>   num <int>, group1 <dbl>, group2 <dbl>
#> group1 and group2 hold different values, since each group was drawn
#> separately.
If you don't want or need the num variable, you can drop it again with select(df_final, -num).
Does that help you?
Upvotes: 1