Reputation: 459
I have a dataframe, `users_full`, in which I would like to create a new variable, `party_followers`, that has the following categories: AFD, SPD, CDU, FDP, Linke, Multiple, or Other.
I would like to create the categories for the new variable by matching `users_full$user_id` to the `user_id` variable of the respective dataframe of followers, e.g., `afd_followers`, `spd_followers`, etc.
The problem is that a user in `users_full$user_id` can be a follower of multiple parties, and I am not sure how to account for this using an `ifelse` statement.
I tried the following, but it's not working.
# Attempt to label each user with the single party they follow, "Multiple",
# or "Other". Kept as posted; it does not work, for these reasons:
#  - ifelse() takes exactly three arguments (test, yes, no); here a flat
#    sequence of test/value pairs is passed to one call instead of nesting
#    one ifelse() inside the `no` slot of the previous one.
#  - `user_id == afd_followers$user_id` compares the two vectors position by
#    position (with recycling); it is not a membership test. `%in%` is the
#    membership operator.
#  - On the last line only the first term is a comparison; the remaining
#    operands of `&` are bare user_id columns (character in the sample data),
#    not logical conditions.
# NOTE(review): fdp_followers is referenced here but is not among the sample
# data frames shown in this post — confirm it exists in the real session.
mutate(users_full, party_followers = ifelse(user_id == afd_followers$user_id & user_id != cdu_followers$user_id & user_id != spd_followers$user_id & user_id != linke_followers$user_id & user_id != fdp_followers$user_id, "AfD",
user_id == cdu_followers$user_id & user_id != afd_followers$user_id & user_id != spd_followers$user_id & user_id != linke_followers$user_id & user_id != fdp_followers$user_id, "CDU",
user_id == spd_followers$user_id & user_id != cdu_followers$user_id & user_id != afd_followers$user_id & user_id != linke_followers$user_id & user_id != fdp_followers$user_id, "SPD",
user_id == linke_followers$user_id & user_id != cdu_followers$user_id & user_id != afd_followers$user_id & user_id != spd_followers$user_id & user_id != fdp_followers$user_id, "Linke",
user_id == fdp_followers$user_id & user_id != cdu_followers$user_id & user_id != afd_followers$user_id & user_id != spd_followers$user_id & user_id != linke_followers$user_id, "FDP",
user_id == afd_followers$user_id & cdu_followers$user_id & spd_followers$user_id & linke_followers$user_id & fdp_followers$user_id, "Multiple", "Other"))
Below, I reconstructed samples of the dataframes I am using.
# Sample data. User IDs are written as quoted strings; several are 18 digits
# long, which is more than a double can represent exactly.

# Users to classify.
users_full <- data.frame(
  user_id = c(
    "3854371132", "883470465498587138",
    "145216962", "2223089418",
    "2861583057", "271413649"
  )
)

# Followers of each party, one data frame per party.
spd_followers <- data.frame(
  user_id = c(
    "145216962", "3864655101",
    "757305123165069312", "4854498122",
    "1201495387", "565422099"
  )
)

afd_followers <- data.frame(
  user_id = c(
    "3854371132", "883470465498587138",
    "845969869778685952", "3864655101",
    "757305123165069312", "793677341042044928"
  )
)

cdu_followers <- data.frame(
  user_id = c(
    "3854371132", "145216962",
    "3864655101", "757305123165069312",
    "3207639056", "4854498122"
  )
)

linke_followers <- data.frame(
  user_id = c(
    "47289872", "1044855103",
    "565082298", "956148596042330112",
    "2490464967", "956147739951329280"
  )
)
I would like to end up with the following output:
user_id party_followers
883470465498587000 AfD
3854371132 Multiple
1044855103 Linke
757305123165069000 Multiple
3207639056 SPD
947682953 Other
Upvotes: 0
Views: 164
Reputation: 3943
Here is a functional approach that uses only base R and can easily be extended by adding to the list of followers and the vector of party names. In the past I'd have used `sapply()`, but `vapply()` is recommended inside functions because the type and length of its output are declared up front, which makes the result more predictable.
# Return the party label for a single user ID.
#
# user_id   : one user ID to look up.
# id_list   : list of data frames, each with a `user_id` column (one per party).
# id_labels : character vector of party names, in the same order as `id_list`.
#
# Returns "Other" when the ID is in none of the data frames, "Multiple" when
# it is in more than one, and otherwise the label of the one party it is in.
get_party <- function(user_id,
                      id_list = followers_by_party,
                      id_labels = party_names) {
  # One TRUE/FALSE per party: does this party's follower table contain the ID?
  is_member <- vapply(
    id_list,
    function(followers) user_id %in% followers$user_id,
    logical(1)
  )
  n_parties <- sum(is_member)

  if (n_parties > 1) {
    "Multiple"
  } else if (n_parties == 0) {
    "Other"
  } else {
    id_labels[is_member]
  }
}
# Party labels and the matching follower data frames. The two must be kept in
# the same order, because get_party() pairs them up by position.
party_names <- c("SPD", "AfD", "CDU", "Linke")
followers_by_party <- list(
  spd_followers,
  afd_followers,
  cdu_followers,
  linke_followers
)

# Look up every user ID in turn; each call must yield exactly one string.
users_full$party_followers <- vapply(
  X = users_full$user_id,
  FUN = get_party,
  FUN.VALUE = character(1)
)
Upvotes: 1
Reputation: 145755
To be scalable without copy/pasting in case you have additional follower data frames, I would put all the followers in a `list`, collapse them to a single data frame, and use a `merge`.
Using `data.table`:
# Label users by the follower list(s) they appear in, using data.table.
# NOTE(review): setkey() and setDT() are called unqualified, so this
# presumably assumes library(data.table) is attached — confirm.

# Named list: the element names become the party label via `idcol` below.
followers = list(spd = spd_followers, afd = afd_followers, cdu = cdu_followers, linke = linke_followers)
# Stack all follower tables into one; the list names go into a new
# "party_followers" column.
foll = data.table::rbindlist(followers, idcol = "party_followers")
setkey(foll, "user_id")
# Ensure the join column is character in both tables.
# NOTE(review): assigning to a key column with := may clear the key set just
# above — confirm whether the setkey()/key= calls still have any effect.
foll[, user_id := as.character(user_id)]
# Convert users_full to a data.table in place, keyed on user_id.
setDT(users_full, key = "user_id")
users_full[, user_id := as.character(user_id)]
# n = number of rows per user_id in the stacked table (presumably the number
# of follower lists the user is in, if no list contains duplicate IDs).
foll[, n := .N, by = user_id]
# Users with more than one row get the single label "multiple" on every row...
foll[n > 1, party_followers := "multiple"]
# ...which makes those rows identical, so unique() collapses them to one.
foll = unique(foll)
# all = TRUE keeps users from both tables; users_full rows with no match get
# NA in party_followers and n.
merge(users_full, foll, all = TRUE)
# user_id party_followers n
# 1: 1044855103 linke 1
# 2: 1201495387 spd 1
# 3: 145216962 multiple 2
# 4: 2223089418 NA NA
# 5: 2490464967 linke 1
# 6: 271413649 NA NA
# 7: 2861583057 NA NA
# 8: 3207639056 cdu 1
...
I'm not sure what `merge` behavior you want. Use `all = TRUE` to include all users in any of the data frames. Use `all.x = TRUE` to restrict to only the users in `users_full`, or `all = FALSE` to do an inner join, keeping only the users that are both in `users_full` and in at least one of the follower data frames. I left in the `n` column showing how many follower data frames each user appears in - you can remove it if you want.
Upvotes: 2
Reputation: 2541
Not using ifelse, just vector comparison from base R. The multiple gets checked last, so it overwrites the places that would have other party names.
# Membership flags: for each row of users_full, is the user ID present in the
# given party's follower table?
spd <- is.element(users_full$user_id, spd_followers$user_id)
afd <- is.element(users_full$user_id, afd_followers$user_id)
cdu <- is.element(users_full$user_id, cdu_followers$user_id)
linke <- is.element(users_full$user_id, linke_followers$user_id)

# Users in two or more parties, and users in none.
multiple <- (spd + afd + cdu + linke) >= 2
other <- !(spd | afd | cdu | linke)

# Start with all NA, then fill in the single-party labels.
party_followers <- rep(NA, length(spd))
party_followers[spd] <- "SPD"
party_followers[afd] <- "AFD"
party_followers[cdu] <- "CDU"
party_followers[linke] <- "Linke"

# Assigned last so that "Multiple" overwrites any single-party label written
# above for users who follow more than one party.
party_followers[multiple] <- "Multiple"
party_followers[other] <- "Other"

users_full$party_followers <- party_followers
Upvotes: 0