user113156
user113156

Reputation: 7147

extracting data from different data frames based on another data frame mappings

I am trying to make list splits based on some category data in R.

I have this data:

# A tibble: 5 x 2
  category to_split  
  <chr>    <chr>     
1 cat12    c(1, 5)   
2 cat22    c(2, 5, 1)
3 cat33    3         
4 cat43    4         
5 cat51    c(5, 2)

Where c(1, 5) corresponds to row 1 and row 5 in this data. c(2, 5, 1) corresponds to row 2, row 5 and row 1.

I also have a second larger data frame which looks like:

# A tibble: 100 x 4
# Groups:   station_location [5]
   category  var1  var2  var3
   <chr>    <dbl> <dbl> <dbl>
 1 cat12        7   0.4    10
 2 cat12       20   1.1   155
 3 cat12       12   0.4     3
 4 cat12        4   0.3    38
 5 cat12       13   0.4    40
 6 cat12        7   0.3    17
 7 cat12        9   0.4    45
 8 cat12        3   0.3    17
 9 cat12        8   0.5    84
10 cat12       32   2.6   378
# ... with 90 more rows

I would like to create new lists such that c(1, 5) extracts the cat12 and cat51 data from the large data frame. Also where c(2, 5, 1) extracts the cat22, cat51 and cat12 data and stores this data in a data frame (inside a list).

I would like to get the list structure something as follows:

list(
     c(1, 5)  - a data frame containing the two corresponding categories of data
     c(2, 5, 1) - a data frame contained the three corresponding categories of data
     3
     4
     c(5, 2)
)

(I don't care about the list names being c(1, 5)... etc.). I will name them from which the category the data came from (in the small data frame), i.e.

list(
     cat12  - data frame containing the two corresponding categories
     cat22
     cat33
     cat43
     cat51
)

I am trying to extract the relevant data from the larger data frame by the mappings in the smaller data frame.

Data:

data_join <- structure(list(category = c("cat12", "cat22", "cat33", "cat43", 
"cat51"), to_split = c("c(1, 5)", "c(2, 5, 1)", "3", "4", "c(5, 2)"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, 
-5L))


full_data <- structure(list(category = c("cat12", "cat12", "cat12", "cat12", 
"cat12", "cat12", "cat12", "cat12", "cat12", "cat12", "cat12", 
"cat12", "cat12", "cat12", "cat12", "cat12", "cat12", "cat12", 
"cat12", "cat12", "cat22", "cat22", "cat22", "cat22", "cat22", 
"cat22", "cat22", "cat22", "cat22", "cat22", "cat22", "cat22", 
"cat22", "cat22", "cat22", "cat22", "cat22", "cat22", "cat22", 
"cat22", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33", 
"cat33", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33", 
"cat33", "cat33", "cat33", "cat33", "cat33", "cat33", "cat33", 
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat43", 
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat43", 
"cat43", "cat43", "cat43", "cat43", "cat43", "cat43", "cat51", 
"cat51", "cat51", "cat51", "cat51", "cat51", "cat51", "cat51", 
"cat51", "cat51", "cat51", "cat51", "cat51", "cat51", "cat51", 
"cat51", "cat51", "cat51", "cat51", "cat51"), var1 = c(7, 20, 
12, 4, 13, 7, 9, 3, 8, 32, 5, 2, 14, 7, 11, 9, 25, 5, 6, 18, 
14, 12, 11, 11, 5, 7, 12, 2, 7, 7, 5, 28, 6, 8, 4, 9, 4, 11, 
6, 5, NA, NA, 24, 6, 6, 29, NA, 11, NA, NA, NA, 9, NA, 8, 7, 
NA, 17, 6, NA, 6, NA, NA, NA, NA, NA, NA, NA, NA, 13, NA, NA, 
NA, NA, 16, 7, 8, NA, NA, 10, 19, 6, 10, 3, 12, 2, 2, 7, 11, 
5, 5, 6, 3, 6, 9, 11, 11, 12, 5, 14, 5), var2 = c(0.4, 1.1, 0.4, 
0.3, 0.4, 0.3, 0.4, 0.3, 0.5, 2.6, 0.6, 0.3, 0.5, 0.4, 0.4, 0.7, 
0.5, 0.3, 0.4, 0.6, 0.5, 0.3, 0.4, 0.2, 0.4, 0.5, 0.5, 0.3, 0.4, 
0.3, 0.4, 1.1, 0.4, 0.5, 0.2, 0.5, 0.4, 0.5, 0.6, 0.6, NA, NA, 
0.7, 0.1, 0.3, 0.5, NA, 0.7, NA, NA, NA, 0.2, NA, 0.3, 0.2, NA, 
0.3, 0.3, NA, 0.1, 0.2, 0.2, 0.5, 0.4, 0.3, 0.4, 0.2, 0.4, 0.3, 
0.3, 0.2, 0.3, 0.2, 0.4, 0.2, 0.2, 0.3, 0.3, 0.5, 0.5, 0.4, 0.2, 
0.3, 0.7, 0.3, 0.1, 0.3, 0.3, 0.4, 0.6, 0.3, 0.2, 0.4, 0.6, 0.2, 
0.7, 0.6, 0.4, 0.6, 0.5), var3 = c(10, 155, 3, 38, 40, 17, 45, 
17, 84, 378, 44, 14, 36, 20, 17, 76, 25, 4, 22, 63, 42, 23, 12, 
10, 15, 29, 26, 7, 18, 5, 23, 204, 24, 56, 7, 35, 23, 55, 28, 
65, 10, 13, 54, 13, 22, 45, 29, 58, 49, 14, 2, 9, 15, 38, 41, 
63, 11, 9, 7, 20, 3, 5, 52, 7, 18, 25, 2, 30, 10, 3, 3, 13, 1, 
12, 7, 5, 5, 9, 13, 4, 14, 9, 8, 147, 5, 7, 2, 10, 6, 66, 2, 
8, 6, 3, 8, 5, 45, 6, 20, 27)), class = c("grouped_df", "tbl_df", 
"tbl", "data.frame"), row.names = c(NA, -100L), groups = structure(list(
    station_location = c("cat12", "cat22", "cat33", "cat43", 
    "cat51"), .rows = list(1:20, 21:40, 41:60, 61:80, 81:100)), row.names = c(NA, 
-5L), class = c("tbl_df", "tbl", "data.frame"), .drop = TRUE))

Upvotes: 1

Views: 47

Answers (1)

dipetkov
dipetkov

Reputation: 3700

It is not clear why the categories and the splits are defined in the same data frame. If each split is a subset of the five categories, there are 31 = 2^5 - 1 possible splits with at least one category.

# Give each category an id
cats <- data_join %>%
  select(category) %>%
  mutate(
    .id = row_number()
  )
head(cats)
#> # A tibble: 5 x 2
#>   category   .id
#>   <chr>    <int>
#> 1 cat12        1
#> 2 cat22        2
#> 3 cat33        3
#> 4 cat43        4
#> 5 cat51        5

# Figure out which category goes into each split/subset (Could have more/fewer than 5?)
subsets <- data_join  %>%
  select(to_split) %>%
  mutate(.id = map(to_split, ~ eval(parse(text = .)))) %>%
  unnest(.id) %>%
  inner_join(cats, by = ".id")
head(subsets)
#> # A tibble: 6 x 3
#>   to_split     .id category
#>   <chr>      <dbl> <chr>   
#> 1 c(1, 5)        1 cat12   
#> 2 c(1, 5)        5 cat51   
#> 3 c(2, 5, 1)     2 cat22   
#> 4 c(2, 5, 1)     5 cat51   
#> 5 c(2, 5, 1)     1 cat12   
#> 6 3              3 cat33

# Now you can join the subsets with the data
df <- full_data %>%
  inner_join(
    subsets, by = "category"
  )

# And split by group
splits <- split(df, df$to_split)

names(splits)
#> [1] "3"          "4"          "c(1, 5)"    "c(2, 5, 1)" "c(5, 2)"

Created on 2019-11-01 by the reprex package (v0.3.0)

Upvotes: 1

Related Questions