Reputation: 111
I need to define a variable in df1 based on variable in df1 df1$person_id
and a variable in df2 df2$gender
. The structure is the following
I have updated the structure of the data because the previous one was wrong.
df1 <- structure(list(person_id = c("15813, 15837, 18118, 16672", "8818, 8132, 18045, 18051, 15916, 10204, 18042, 12455, 18218, 18046, 18050, 18165, 18043, 18052, 18048", "8818, 8132, 18051, 18045, 18042, 10204, 12455, 15916, 18218, 18046, 18050, 18165, 18043, 18052, 18048"), gender = c(NA, NA, NA)), row.names = 12:14, class = "data.frame")
df2 <- structure(list(id = structure(c(278L, 363L, 156L), .Label = c("10096", "10204", "11094", "11096", "11097", "11098", "11102", "11106", "11109", "11116", "11118", "11121"), class = "factor"), gender = c(1L, 1L, 1L)), row.names = c(NA, 3L), class = "data.frame")
The desired output is two columns - df1$person_id
and df1$gender
. df1$gender
may have three options: either all male, or all female, or both.
Please, could you help me?
Upvotes: 0
Views: 64
Reputation: 72593
You could write a genderize
function based on table
. And apply it on a strsplit
. I've changed author
data so that one row has both male.
genderize <- function(id) {
r <- table(gender[as.double(id), "gender"])
if (length(r) == 2) r <- 3
else r <- as.double(names(r))
factor(r, levels=1:3, labels=c("female", "male", "both"))
}
authors$genders <- sapply(strsplit(as.character(authors$ids), ", "), genderize)
ids genders
1 3, 7 male
2 3, 4, 5, 6, 7, 8 both
3 6 female
4 1, 4, 7, 8 both
Data
authors <- structure(list(ids = structure(c(3L, 2L, 4L, 1L), .Label = c("1, 4, 7, 8",
"3, 4, 5, 6, 7, 8", "3, 7", "6"), class = "factor"), genders = c(NA,
NA, NA, NA)), class = "data.frame", row.names = c(NA, -4L))
gender <- structure(list(id = 1:8, gender = c(1, 1, 2, 1, 1, 1, 2, 1)), row.names = c(NA,
-8L), class = "data.frame")
Upvotes: 0
Reputation: 14764
Another option:
idx <- strsplit(as.character(authors$ids), split = ', ')
transform(
authors,
genders = ifelse(
sapply(idx, function(x) all(c(1, 2) %in% gender$gender[match(x, gender$id)])),
'Both',
ifelse(
sapply(idx, function(x) any(1 == gender$gender[match(x, gender$id)]) & !any(2 == gender$gender[match(x, gender$id)])),
'Male',
'Female')
)
)
Output:
ids genders
1 2, 1 Male
2 3, 4, 5, 6, 7, 8 Both
3 6 Male
4 1, 4, 7, 8 Both
Upvotes: 0
Reputation: 206167
Here's how I might do this using dplyr
and tidyr
. No need for loops.
library(dplyr)
library(tidyr)
authors %>%
mutate(record=row_number(), genders=NULL, id=ids) %>%
separate_rows(id, convert=TRUE) %>%
left_join(gender) %>%
group_by(record, ids) %>%
summarize(genders=paste(gender, collapse=", "),
all_male=all(gender==1),
all_female=all(gender==2),
both=any(gender==1) & any(gender==2),
class=case_when(all(gender==1)~"Male",
all(gender==2)~"Female",
TRUE~"Both"))
Basically we first expand the comma separated values into separate rows. That allows us to easily join the data to the gender info, then we summarize to check the distribution of genders among the authors.
This gives
record ids genders all_male all_female both class
<int> <fct> <chr> <lgl> <lgl> <lgl> <chr>
1 1 2, 1 1, 1 TRUE FALSE FALSE Male
2 2 3, 4, 5, 6, 7, 8 2, 1, 1, 1, 2, 1 FALSE FALSE TRUE Both
3 3 6 1 TRUE FALSE FALSE Male
4 4 1, 4, 7, 8 1, 1, 2, 1 FALSE FALSE TRUE Both
Upvotes: 1
Reputation: 6921
Here's how I'd do it with dplyr
and tidyr
:
# Note the stringsAsFactors=T
authors <- data.frame(ids = c("2, 1", "3, 4, 5, 6, 7, 8", "6", "1, 4, 7, 8"), genders = c(NA, NA, NA, NA),
stringsAsFactors = FALSE)
gender <- structure(list(id = 1:8, gender = c(1, 1, 2, 1, 1, 1, 2, 1)),row.names = c(NA, -8L), class = "data.frame")
library("dplyr")
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
# Let's normalise your groups
groups <-
authors %>%
# create ID for the group of author
mutate(group_id = 1:n()) %>%
# split authors
mutate(author_id = strsplit(ids, split = ", ", fixed = TRUE)) %>%
tidyr::unnest(author_id) %>%
# cast to integer for compatibility with the other table
mutate(author_id = as.integer(author_id)) %>%
select(group_id, author_id)
# This is the normalized version of your group / author relationship
groups
#> # A tibble: 13 x 2
#> group_id author_id
#> <int> <int>
#> 1 1 2
#> 2 1 1
#> 3 2 3
#> 4 2 4
#> 5 2 5
#> 6 2 6
#> 7 2 7
#> 8 2 8
#> 9 3 6
#> 10 4 1
#> 11 4 4
#> 12 4 7
#> 13 4 8
# Technically this is the "author" table since it contains author details
# I'll rename it for clarity
authors <- gender
group_gender <-
groups %>%
left_join(authors, by = c("author_id" = "id")) %>%
group_by(group_id) %>%
summarise(gender = case_when(
all(gender == 1) ~ "Male",
all(gender == 2) ~ "Female",
TRUE ~ "Both"
))
# I took the liberty to create a single column with the gender.
# It makes it easier to compute gender stats.
group_gender
#> # A tibble: 4 x 2
#> group_id gender
#> <int> <chr>
#> 1 1 Male
#> 2 2 Both
#> 3 3 Male
#> 4 4 Both
Created on 2020-03-06 by the reprex package (v0.3.0)
Upvotes: 1