user113156
user113156

Reputation: 7107

split a data frame and map over lists

I have some data which looks like the following:

library(sweep)

data <- bike_sales

data$group <- sample(1:4, 15644, replace = TRUE)


data %>% 
  split(.$group)

I want to filter some of the data based on these splits/groups.

Filter list 1 if price is less than 1500 (i.e. keep all these observations)

Take a random sample of list 2

Filter list 3 if price is between 3000 - 5000

Filter list 4 if price is greater than 7000

Once all these splits have been filtered, use bind_rows to put it all back together again. I just don't know how to start by using the map function here.

Data:

# A tibble: 3,887 x 18
   order.date order.id order.line quantity price price.ext customer.id bikeshop.name             bikeshop.city bikeshop.state latitude longitude product.id model                          category.primary category.secondary frame   group
   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>       <dbl> <chr>                     <chr>         <chr>             <dbl>     <dbl>      <dbl> <chr>                          <chr>            <chr>              <chr>   <int>
 1 2011-01-10        3          2        1  3200      3200           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8         50 Jekyll Carbon 4                Mountain         Over Mountain      Carbon      4
 2 2011-01-10        3          4        1  5330      5330           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8          4 Supersix Evo Hi-Mod Dura Ace 2 Road             Elite Road         Carbon      4
 3 2011-01-10        3          5        1  1570      1570           6 Louisville Race Equipment Louisville    KY                 38.3     -85.8         34 Synapse Disc 105               Road             Endurance Road     Alumin~     4
 4 2011-01-12        7          5        1  2340      2340           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         87 Habit 5                        Mountain         Trail              Alumin~     4
 5 2011-01-12        7          9        1  3200      3200           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         61 Scalpel-Si 5                   Mountain         Cross Country Race Alumin~     4
 6 2011-01-12        7         10        8  1410     11280           9 Minneapolis Bike Shop     Minneapolis   MN                 45.0     -93.3         18 CAAD8 105                      Road             Elite Road         Alumin~     4
 7 2011-01-17        9          2        8  3200     25600          25 New Orleans Velocipedes   New Orleans   LA                 30.0     -90.1         13 CAAD12 Red                     Road             Elite Road         Alumin~     4
 8 2011-01-18       11          2        8  3200     25600          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          13 CAAD12 Red                     Road             Elite Road         Alumin~     4
 9 2011-01-18       11          4        1  4500      4500          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          39 Slice Hi-Mod Dura Ace D12      Road             Triathalon         Carbon      4
10 2011-01-18       11          5        1  7460      7460          19 San Francisco Cruisers    San Francisco CA                 37.8    -122.          23 Synapse Hi-Mod Disc Red        Road             Endurance Road     Carbon      4
# ... with 3,877 more rows

Upvotes: 1

Views: 91

Answers (3)

Parfait
Parfait

Reputation: 107642

Consider base R, also without splitting or mapping by using transform, merge, and subset. Specifically, merge to a separate data frame for lower/upper range assignments for later filtering. But for the special group 2 sampling, a required object is needed grp2_sample using row.names:

grp2_sample <- sample(rownames(bike_sales[bike_sales$group == 2,]), 5)   # SAMPLE OF 5

sub_df <- subset(merge(transform(bike_sales, rn = row.names(bike_sales)),
                       data.frame(group = c(1,3,4),
                                  lower = c(-Inf, 3000, 7000),
                                  upper = c(1500, 5000, Inf)),
                       by ="group", all.x=TRUE),
                 (price >= lower & price <= upper) | (rn %in% grp2_sample)
          )

Alternatively with dplyr using counterpart mutate, left_join, and filter:

library(dplyr)
...
grp2_sample <- sample(rownames(bike_sales[bike_sales$group == 2,]), 5)   # SAMPLE OF 5

sub_df2 <- bike_sales %>%
             mutate(rn = row.names(bike_sales)) %>%
             left_join(data.frame(group = c(1,3,4),
                                  lower = c(-Inf, 3000, 7000),
                                  upper = c(1500, 5000, Inf)), 
                       by="group") %>%
             filter((price >= lower & price <= upper) | (rn %in% grp2_sample))

And even a data.table alternative solution:

library(data.table)
...
grp2_sample <- sample(rownames(bike_sales[bike_sales$pick == 2,]), 5)    # SAMPLE OF 5

sub_dt <- setDT(bike_sales)[, rn := row.names(bike_sales)][
                            data.table(group = c(1,3,4),
                                       lower = c(-Inf, 3000, 7000),
                                       upper = c(1500, 5000, Inf)), 
                            on="group", 
                            `:=`(lower=i.lower, upper=i.upper)
                           ][(price >= lower & price <= upper) | (rn %in% grp2_sample),]

Upvotes: 1

B. Christian Kamgang
B. Christian Kamgang

Reputation: 6489

Using data.table package (without splitting the data)

library(data.table)
setDT(data, key = "group")

fun <- function(x, grp, df) {
  if(grp == 1) df[x < 1500] else
    if(grp == 2) df[sample(nrow(df), 1)] else       # sample one row
      if(grp == 3) df[between(x, 3000, 5000)] else
        if(grp == 4) df[x > 7000]
}

data[, fun(price, .GRP, .SD), group]

Upvotes: 2

camille
camille

Reputation: 16842

Following up on @antoine-sac's suggestion of making a threshold argument, I'd recommend a list of parameters for each group. You've got several pieces of metadata for each group: a lower limit (or -Inf for cases with no lower limit), an upper limit (or Inf if no upper limit), and whether to sample instead of filter. If you're sampling, you'll just do that instead of filtering.

library(dplyr)
library(purrr)
library(sweep)

set.seed(1248)
data <- bike_sales
data$group <- sample(1:4, 15644, replace = TRUE)

params <- list(
  `1` = list(low = -Inf, high = 1500, samp = F),
  `2` = list(low = NULL, high = NULL, samp = T),
  `3` = list(low = 3000, high = 5000, samp = F),
  `4` = list(low = 7000, high = Inf, samp = F)
)

data_filtered <- data %>% 
  split(.$group) %>%
  map2(params, function(dat, p) {
    if (p$samp) {
      sample_n(dat, 1)
    } else {
      dat %>%
        filter(between(price, p$low, p$high))
    }
  })

These are big, so here's a tiny subset of each:

data_filtered %>% map(~select(., 1:6) %>% head(3))
#> $`1`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-11        5          1        1   480       480
#> 2 2011-01-12        7         10        8  1410     11280
#> 3 2011-01-12        8          1        1  1250      1250
#> 
#> $`2`
#> # A tibble: 1 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2012-07-11      522          3        1  6390      6390
#> 
#> $`3`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-11        4          1        1  4800      4800
#> 2 2011-01-18       11          2        8  3200     25600
#> 3 2011-01-18       11          6        1  3200      3200
#> 
#> $`4`
#> # A tibble: 3 x 6
#>   order.date order.id order.line quantity price price.ext
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>
#> 1 2011-01-18       11          5        1  7460      7460
#> 2 2011-01-20       12          9        1  9590      9590
#> 3 2011-01-20       12         19        1  7460      7460

In the case where you want them all row-bound back into a single data frame, use map2_dfr instead of map2.

data %>% 
  split(.$group) %>%
  map2_dfr(params, function(dat, p) { ## <--- change here
    if (p$samp) {
      sample_n(dat, 1)
    } else {
      dat %>%
        filter(between(price, p$low, p$high))
    }
  }) %>%
  head(3)
#> # A tibble: 3 x 18
#>   order.date order.id order.line quantity price price.ext customer.id
#>   <date>        <dbl>      <int>    <dbl> <dbl>     <dbl>       <dbl>
#> 1 2011-01-11        5          1        1   480       480           8
#> 2 2011-01-12        7         10        8  1410     11280           9
#> 3 2011-01-12        8          1        1  1250      1250          16
#> # … with 11 more variables: bikeshop.name <chr>, bikeshop.city <chr>,
#> #   bikeshop.state <chr>, latitude <dbl>, longitude <dbl>,
#> #   product.id <dbl>, model <chr>, category.primary <chr>,
#> #   category.secondary <chr>, frame <chr>, group <int>

Upvotes: 3

Related Questions