maaas
maaas

Reputation: 115

R: Create dummy if column includes duplicate given group

I would like to create a dummy variable that takes the value 1 if an individual is observed in two or more different age groups and 0 otherwise.

Is someone able to do that and could explain it to me?

A small example could be:

set.seed(123)
df <- data.frame(id = sample(1:10, 30, replace = TRUE),
             agegroup = sample(c("5054", "5559", "6065"), 30, replace = TRUE))

And expected output:

id  agegroup    dummy
 3     6065       1
 8     6065       1
 5     6065       1
 9     6065       1
10     5054       1
 1     5559       0
 6     6065       1
 9     5054       1
 6     5054       1
 5     5054       1
10     5054       1
 5     5559       1
 7     5559       1
 6     5559       1
 2     5054       1
 9     5054       1
 3     5054       1
 1     5559       0
 4     5054       0
10     6065       1
 9     5054       1
 7     5559       1
 7     6065       1
10     5054       1
 7     5559       1
 8     5054       1
 6     5054       1
 6     6065       1
 3     6065       1
 2     5559       1

Upvotes: 6

Views: 2028

Answers (6)

moodymudskipper
moodymudskipper

Reputation: 47330

In base R (nor preserving row order), or with dplyr (preserving row order)

base R

merge(df, transform(unique(df),dummy = as.numeric(duplicated(id)|duplicated(id,fromLast = T))))

#    id agegroup dummy
# 1   1     5559     0
# 2   1     5559     0
# 3  10     5054     1
# 4  10     5054     1
# 5  10     5054     1
# 6  10     6065     1
# 7   2     5054     1
# 8   2     5559     1
# 9   3     5054     1
# 10  3     6065     1
# 11  3     6065     1
# 12  4     5054     0
# 13  5     5054     1
# 14  5     5559     1
# 15  5     6065     1
# 16  6     5054     1
# 17  6     5054     1
# 18  6     5559     1
# 19  6     6065     1
# 20  6     6065     1
# 21  7     5559     1
# 22  7     5559     1
# 23  7     5559     1
# 24  7     6065     1
# 25  8     5054     1
# 26  8     6065     1
# 27  9     5054     1
# 28  9     5054     1
# 29  9     5054     1
# 30  9     6065     1

dplyr

library(dplyr)
df %>%
  distinct %>% 
  mutate(dummy = as.numeric(duplicated(id)|duplicated(id,fromLast = T))) %>%
  inner_join(df,.)

#    id agegroup dummy
# 1   3     6065     1
# 2   8     6065     1
# 3   5     6065     1
# 4   9     6065     1
# 5  10     5054     1
# 6   1     5559     0
# 7   6     6065     1
# 8   9     5054     1
# 9   6     5054     1
# 10  5     5054     1
# 11 10     5054     1
# 12  5     5559     1
# 13  7     5559     1
# 14  6     5559     1
# 15  2     5054     1
# 16  9     5054     1
# 17  3     5054     1
# 18  1     5559     0
# 19  4     5054     0
# 20 10     6065     1
# 21  9     5054     1
# 22  7     5559     1
# 23  7     6065     1
# 24 10     5054     1
# 25  7     5559     1
# 26  8     5054     1
# 27  6     5054     1
# 28  6     6065     1
# 29  3     6065     1
# 30  2     5559     1

Upvotes: 0

MKR
MKR

Reputation: 20095

An option is to use dplyr::group_by(id) and count unique agegroup. Your data contains duplicate rows for id and agegroup combination.

Edit: Updated with comments from @Henrik

library(dplyr)

df %>% group_by(id) %>%
  mutate(dummy = as.integer(n_distinct(agegroup) > 1))    

# # A tibble: 30 x 3
# # Groups: id [10]
#      id agegroup dummy
#   <int> <fctr>   <int>
# 1     3 6065         1
# 2     8 6065         1
# 3     5 6065         1
# 4     9 6065         1
# 5    10 5054         1
# 6     1 5559         0
# 7     6 6065         1
# 8     9 5054         1
# 9     6 5054         1
# 10     5 5054         1
# # ... with 20 more rows

Upvotes: 5

Sathish
Sathish

Reputation: 12723

Here is a data.table solution

library('data.table')
setDT(df)[, N := uniqueN(agegroup), by = .(id)][N == 1, N := 0 ][N > 1, N := 1]

or

setDT(df)[, N := as.integer( uniqueN(agegroup) > 1 ), by = .(id)]

Upvotes: 4

Karolis Koncevičius
Karolis Koncevičius

Reputation: 9656

One more base R solution with ifelse:

df$dummy <- ifelse(df$id %in% names(which(lengths(tapply(df$agegroup, df$id, unique)) > 1)), 1, 0)

head(df)
  id agegroup dummy
1  3     6065     1
2  8     6065     1
3  5     6065     1
4  9     6065     1
5 10     5054     1
6  1     5559     0

Upvotes: 2

www
www

Reputation: 39154

Another dplyr solution. n_distinct is to count distinct numbers. (dummy > 1) + 0L is to evaluate if the number is larger than 1 and convert to integer.

library(dplyr)

df2 <- df %>%
  group_by(id) %>%
  mutate(dummy = n_distinct(agegroup)) %>%
  mutate(dummy = (dummy > 1) + 0L) %>%
  ungroup()
df2
# # A tibble: 30 x 3
#       id agegroup dummy
#    <int> <fct>    <int>
#  1     3 6065         1
#  2     8 6065         1
#  3     5 6065         1
#  4     9 6065         1
#  5    10 5054         1
#  6     1 5559         0
#  7     6 6065         1
#  8     9 5054         1
#  9     6 5054         1
# 10     5 5054         1
# # ... with 20 more rows

Upvotes: 3

Rui Barradas
Rui Barradas

Reputation: 76565

Using base R you can do this with ave. You only need to coerce the factor or character variable agegroup to numeric.

df$dummy <- ave(as.numeric(as.character(df$agegroup)), df$id, FUN = function(x) length(unique(x)) != 1)
head(df, 10)
#   id agegroup dummy
#1   3     6065     1
#2   8     6065     1
#3   5     6065     1
#4   9     6065     1
#5  10     5054     1
#6   1     5559     0
#7   6     6065     1
#8   9     5054     1
#9   6     5054     1
#10  5     5054     1

Upvotes: 3

Related Questions