Reputation: 7147
I am trying to make list splits based on some category data in R.
I have this data:
# A tibble: 5 x 2
category to_split
<chr> <chr>
1 cat12 c(1, 5)
2 cat22 c(2, 5, 1)
3 cat33 3
4 cat43 4
5 cat51 c(5, 2)
Where c(1, 5)
corresponds to row 1 and row 5 in this data. c(2, 5, 1)
corresponds to row 2, row 5 and row 1.
I also have a second larger data frame which looks like:
# A tibble: 100 x 4
# Groups: station_location [5]
category var1 var2 var3
<chr> <dbl> <dbl> <dbl>
1 cat12 7 0.4 10
2 cat12 20 1.1 155
3 cat12 12 0.4 3
4 cat12 4 0.3 38
5 cat12 13 0.4 40
6 cat12 7 0.3 17
7 cat12 9 0.4 45
8 cat12 3 0.3 17
9 cat12 8 0.5 84
10 cat12 32 2.6 378
# ... with 90 more rows
I would like to create new lists such that c(1, 5)
extracts the cat12
and cat51
data from the large data frame. Also where c(2, 5, 1)
extracts the cat22
, cat51
and cat12
data and stores this data in a data frame (inside a list).
I would like to get the list structure something as follows:
list(
c(1, 5) - a data frame containing the two corresponding categories of data
c(2, 5, 1) - a data frame contained the three corresponding categories of data
3
4
c(5, 2)
)
(I don't care about the list names being c(1, 5)...
etc.). I will name them from which the category the data came from (in the small data frame), i.e.
list(
cat12 - data frame containing the two corresponding categories
cat22
cat33
cat43
cat51
)
I am trying to extract the relevant data from the larger data frame by the mappings in the smaller data frame.
Data:
data_join <- structure(list(category = c("cat12", "cat22", "cat33", "cat43",
"cat51"), to_split = c("c(1, 5)", "c(2, 5, 1)", "3", "4", "c(5, 2)"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L))
full_data <- structure(list(category = c("cat12", "cat12", "cat12", "cat12",
"cat12", "cat12", "cat12", "cat12", "cat12", "cat12", "cat12",
"cat12", "cat12", "cat12", "cat12", "cat12", "cat12", "cat12",
"cat12", "cat12", "cat22", "cat22", "cat22", "cat22", "cat22",
"cat22", "cat22", "cat22", "cat22", "cat22", "cat22", "cat22",
"cat22", "cat22", "cat22", "cat22", "cat22", "cat22", "cat22",
"cat22", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33",
"cat33", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33",
"cat33", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33",
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat43",
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat43",
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat51",
"cat51", "cat51", "cat51", "cat51", "cat51", "cat51", "cat51",
"cat51", "cat51", "cat51", "cat51", "cat51", "cat51", "cat51",
"cat51", "cat51", "cat51", "cat51", "cat51"), var1 = c(7, 20,
12, 4, 13, 7, 9, 3, 8, 32, 5, 2, 14, 7, 11, 9, 25, 5, 6, 18,
14, 12, 11, 11, 5, 7, 12, 2, 7, 7, 5, 28, 6, 8, 4, 9, 4, 11,
6, 5, NA, NA, 24, 6, 6, 29, NA, 11, NA, NA, NA, 9, NA, 8, 7,
NA, 17, 6, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, 13, NA, NA,
NA, NA, 16, 7, 8, NA, NA, 10, 19, 6, 10, 3, 12, 2, 2, 7, 11,
5, 5, 6, 3, 6, 9, 11, 11, 12, 5, 14, 5), var2 = c(0.4, 1.1, 0.4,
0.3, 0.4, 0.3, 0.4, 0.3, 0.5, 2.6, 0.6, 0.3, 0.5, 0.4, 0.4, 0.7,
0.5, 0.3, 0.4, 0.6, 0.5, 0.3, 0.4, 0.2, 0.4, 0.5, 0.5, 0.3, 0.4,
0.3, 0.4, 1.1, 0.4, 0.5, 0.2, 0.5, 0.4, 0.5, 0.6, 0.6, NA, NA,
0.7, 0.1, 0.3, 0.5, NA, 0.7, NA, NA, NA, 0.2, NA, 0.3, 0.2, NA,
0.3, 0.3, NA, 0.1, 0.2, 0.2, 0.5, 0.4, 0.3, 0.4, 0.2, 0.4, 0.3,
0.3, 0.2, 0.3, 0.2, 0.4, 0.2, 0.2, 0.3, 0.3, 0.5, 0.5, 0.4, 0.2,
0.3, 0.7, 0.3, 0.1, 0.3, 0.3, 0.4, 0.6, 0.3, 0.2, 0.4, 0.6, 0.2,
0.7, 0.6, 0.4, 0.6, 0.5), var3 = c(10, 155, 3, 38, 40, 17, 45,
17, 84, 378, 44, 14, 36, 20, 17, 76, 25, 4, 22, 63, 42, 23, 12,
10, 15, 29, 26, 7, 18, 5, 23, 204, 24, 56, 7, 35, 23, 55, 28,
65, 10, 13, 54, 13, 22, 45, 29, 58, 49, 14, 2, 9, 15, 38, 41,
63, 11, 9, 7, 20, 3, 5, 52, 7, 18, 25, 2, 30, 10, 3, 3, 13, 1,
12, 7, 5, 5, 9, 13, 4, 14, 9, 8, 147, 5, 7, 2, 10, 6, 66, 2,
8, 6, 3, 8, 5, 45, 6, 20, 27)), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
station_location = c("cat12", "cat22", "cat33", "cat43",
"cat51"), .rows = list(1:20, 21:40, 41:60, 61:80, 81:100)), row.names = c(NA,
-5L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))
Upvotes: 1
Views: 47
Reputation: 3700
It is not clear why the categories and the splits are defined in the same data frame. If each split is a subset of the five categories, there are 31 = 2^5 - 1 possible splits with at least one category.
# Give each category an id
cats <- data_join %>%
select(category) %>%
mutate(
.id = row_number()
)
head(cats)
#> # A tibble: 5 x 2
#> category .id
#> <chr> <int>
#> 1 cat12 1
#> 2 cat22 2
#> 3 cat33 3
#> 4 cat43 4
#> 5 cat51 5
# Figure out which category goes into each split/subset (Could have more/fewer than 5?)
subsets <- data_join %>%
select(to_split) %>%
mutate(.id = map(to_split, ~ eval(parse(text = .)))) %>%
unnest(.id) %>%
inner_join(cats, by = ".id")
head(subsets)
#> # A tibble: 6 x 3
#> to_split .id category
#> <chr> <dbl> <chr>
#> 1 c(1, 5) 1 cat12
#> 2 c(1, 5) 5 cat51
#> 3 c(2, 5, 1) 2 cat22
#> 4 c(2, 5, 1) 5 cat51
#> 5 c(2, 5, 1) 1 cat12
#> 6 3 3 cat33
# Now you can join the subsets with the data
df <- full_data %>%
inner_join(
subsets, by = "category"
)
# And split by group
splits <- split(df, df$to_split)
names(splits)
#> [1] "3" "4" "c(1, 5)" "c(2, 5, 1)" "c(5, 2)"
Created on 2019-11-01 by the reprex package (v0.3.0)
Upvotes: 1