
Reputation: 285

create groups from continuous variables with increments

I'm trying to create categorical groups from continuous variables with increments.

# Example data: 20 integer scores drawn with replacement from 1..100,
# stored both as a free vector and as the single column of `df`.
score <- sample(seq_len(100), size = 20, replace = TRUE)
df <- data.frame(score = score)

I want to create new indicator columns based on increments of 20 (upper bound exclusive) in the score column, and I want the names of the new columns to follow the format shown below. It would look something like this:

# Flag each 20-point band of `score` with a 1. case_when() has no fallback
# branch here, so rows outside a band get NA in that column.
df <- df %>%
  mutate(G1_0_20 = case_when(score >= 0 & score < 20 ~ 1),
         G2_20_40 = case_when(score >= 20 & score < 40 ~ 1),
         G3_40_60 = case_when(score >= 40 & score < 60 ~ 1),
         G4_60_80 = case_when(score >= 60 & score < 80 ~ 1),
         G5_80_100 = case_when(score >= 80 & score < 100 ~ 1))
# Turn only the NA placeholders into 0. (`df[] <- 0` would overwrite every
# cell of the data frame, including `score`.)
df[is.na(df)] <- 0

I'm wondering if there's an easier and faster way of creating this for a dataset that needs groups created from increments of 20 from values 0 to 4000.

Also, what if I want increments of 20 from values 0 to 100, and then increments of 100 from 200 to 300?

I would appreciate all the help there is! Thanks!!!

Upvotes: 3

Views: 340

Answers (3)


Reputation: 79338

In base R:

# Bin the scores into right-closed intervals (0,20], (20,40], ..., (3980,4000].
# dig.lab = 4 keeps four-digit breaks as plain digits in the labels; with the
# default of 3 they print as e.g. "(1e+03,1.02e+03]", which the regex below
# cannot parse.
a <- cut(df$score, seq(0, 4000, 20), dig.lab = 4)
# Build names like "1_0_20": the bin index followed by the two interval bounds.
G <- paste0(as.integer(a), sub("\\((\\d+),(\\d+)\\]", "_\\1_\\2", a))
# ~G+0 drops the intercept so every observed group gets its own 0/1 column;
# model.matrix() prefixes each level with the variable name "G".
data.frame(score = df$score, model.matrix(~G+0))

  score G1_0_20 G2_20_40 G3_40_60 G4_60_80 G5_80_100
1     31       0        1        0        0         0
2     79       0        0        0        1         0
3     51       0        0        1        0         0
4     14       1        0        0        0         0
5     67       0        0        0        1         0
6     42       0        0        1        0         0
7     50       0        0        1        0         0
8     43       0        0        1        0         0
9     14       1        0        0        0         0
10    25       0        1        0        0         0
11    90       0        0        0        0         1
12    91       0        0        0        0         1
13    69       0        0        0        1         0
14    91       0        0        0        0         1
15    57       0        0        1        0         0
16    92       0        0        0        0         1
17     9       1        0        0        0         0
18    93       0        0        0        0         1
19    99       0        0        0        0         1
20    72       0        0        0        1         0

Upvotes: 2


Reputation: 18581

For this we can use dplyover::over() and create a sequence to loop over with seq(). Disclaimer: The package is not on CRAN and I'm the maintainer.

The .names argument allows us to create nice names on the fly, we can use {x_idx} to access the number of the index element and {x} to access the value of the iteration.

The example below shows a sequence from 20 to 100 but we can generate any sequence by just replacing the numbers.

# Example data: 20 integer scores drawn with replacement from 1..100,
# stored both as a free vector and as the single column of `df`.
score <- sample(seq_len(100), size = 20, replace = TRUE)
df <- data.frame(score = score)

library(dplyover) #

# Loop over the upper bounds 20, 40, ..., 100 and emit one 0/1 column each.
# The lower bound is inclusive and the upper bound exclusive, so a score that
# sits exactly on a break (20, 40, ...) falls into the band that starts there.
df %>% 
  mutate(over(seq(20, 100, 20),
              ~ if_else(score >= (.x - 20) & score < .x, 1, 0),
              .names = "G{x_idx}_{x - 20}_{x}"))
#>    score G1_0_20 G2_20_40 G3_40_60 G4_60_80 G5_80_100
#> 1     31       0        1        0        0         0
#> 2     79       0        0        0        1         0
#> 3     51       0        0        1        0         0
#> 4     14       1        0        0        0         0
#> 5     67       0        0        0        1         0
#> 6     42       0        0        1        0         0
#> 7     50       0        0        1        0         0
#> 8     43       0        0        1        0         0
#> 9     14       1        0        0        0         0
#> 10    25       0        1        0        0         0
#> 11    90       0        0        0        0         1
#> 12    91       0        0        0        0         1
#> 13    69       0        0        0        1         0
#> 14    91       0        0        0        0         1
#> 15    57       0        0        1        0         0
#> 16    92       0        0        0        0         1
#> 17     9       1        0        0        0         0
#> 18    93       0        0        0        0         1
#> 19    99       0        0        0        0         1
#> 20    72       0        0        0        1         0

Created on 2023-02-27 by the reprex package (v2.0.1)

Upvotes: 2


Reputation: 887851

We may use cut to create the grouping and then use dummy_cols from fastDummies to create the dummy columns.

df %>%
   # Bin the scores into right-closed intervals. dig.lab = 4 keeps four-digit
   # breaks (1000 ... 4000) as plain digits in the labels; the default of 3
   # prints them as "1e+03", which the pattern below cannot match.
   mutate(grp = cut(score, breaks = c(-Inf, seq(0, 4000, by = 20), Inf),
                    dig.lab = 4), 
      # droplevels() discards empty bins, so the numeric prefix counts only
      # the bins that actually occur in the data.
      grp = str_c("G", as.integer(droplevels(grp)), '_', 
      str_replace(grp, '\\((\\d+),(\\d+)\\]', 
     '\\1_\\2'))) %>% 
   # One 0/1 column per observed group; the helper `grp` column is removed.
   dummy_cols("grp", remove_selected_columns = TRUE) %>% 
   # Strip the "grp_" prefix that dummy_cols() adds to each new column.
   rename_with(~ str_remove(.x, 'grp_'), starts_with('grp_'))


    score G1_0_20 G2_20_40 G3_40_60 G4_60_80 G5_80_100
1     20       1        0        0        0         0
2     99       0        0        0        0         1
3     44       0        0        1        0         0
4     28       0        1        0        0         0
5     63       0        0        0        1         0
6     88       0        0        0        0         1
7     44       0        0        1        0         0
8     59       0        0        1        0         0
9    100       0        0        0        0         1
10    55       0        0        1        0         0
11    37       0        1        0        0         0
12    54       0        0        1        0         0
13     6       1        0        0        0         0
14     7       1        0        0        0         0
15    48       0        0        1        0         0
16    88       0        0        0        0         1
17    97       0        0        0        0         1
18    10       1        0        0        0         0
19    65       0        0        0        1         0
20    18       1        0        0        0         0

Upvotes: 4

Related Questions