Reputation: 3087
I have a simple table with emp_id
and job_code
. I would like to return the correct payout
based on the job_code
I've managed this with nested ifelse's but what if I have more job_code
's?
library(dplyr)
set.seed(1)
emp_id <- round(rnorm(100, 500000, 10000))
job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE)
result <- sample(c(1,2,3,4), 100, replace = TRUE)
df <- data.frame(emp_id = emp_id, job_code = job_code, result = result)
job_a <- c(0, 500, 1000, 5000)
job_b <- c(0, 200, 500, 750)
job_c <- c(0, 250, 750, 1000)
# Works but sucky
df %>% mutate(payout = ifelse(job_code == 'a', job_a[result],
ifelse(job_code == 'b', job_b[result],
job_c[result])))
and dput
if you prefer:
structure(list(emp_id = c(493735, 501836, 491644, 515953, 503295,
491795, 504874, 507383, 505758, 496946, 515118, 503898, 493788,
477853, 511249, 499551, 499838, 509438, 508212, 505939, 509190,
507821, 500746, 480106, 506198, 499439, 498442, 485292, 495218,
504179, 513587, 498972, 503877, 499462, 486229, 495850, 496057,
499407, 511000, 507632, 498355, 497466, 506970, 505567, 493112,
492925, 503646, 507685, 498877, 508811, 503981, 493880, 503411,
488706, 514330, 519804, 496328, 489559, 505697, 498649, 524016,
499608, 506897, 500280, 492567, 501888, 481950, 514656, 501533,
521726, 504755, 492901, 506107, 490659, 487464, 502914, 495567,
500011, 500743, 494105, 494313, 498648, 511781, 484764, 505939,
503330, 510631, 496958, 503700, 502671, 494575, 512079, 511604,
507002, 515868, 505585, 487234, 494267, 487754, 495266), job_code = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 2L,
3L, 3L, 2L, 1L, 1L, 1L, 2L, 3L, 2L, 1L, 1L, 2L, 3L, 2L, 1L, 2L,
2L, 2L, 3L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 1L, 2L, 1L, 2L, 1L, 2L,
3L, 3L, 3L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 3L, 2L, 1L, 1L, 3L, 3L,
1L, 1L, 3L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 3L, 1L,
2L, 3L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 1L, 2L, 3L, 1L,
1L, 1L, 3L), .Label = c("a", "b", "c"), class = "factor"), result = c(3,
1, 2, 2, 2, 4, 1, 4, 1, 2, 1, 1, 4, 3, 2, 2, 1, 2, 4, 3, 3, 2,
2, 4, 4, 4, 4, 4, 2, 4, 4, 2, 2, 4, 1, 2, 2, 1, 3, 4, 4, 1, 3,
2, 3, 2, 2, 1, 2, 3, 2, 1, 4, 2, 4, 2, 4, 1, 4, 2, 1, 2, 4, 2,
3, 4, 1, 3, 3, 2, 2, 3, 4, 1, 1, 2, 2, 4, 1, 2, 2, 3, 3, 4, 1,
1, 4, 4, 1, 4, 1, 1, 4, 3, 1, 2, 3, 2, 2, 1)), .Names = c("emp_id",
"job_code", "result"), row.names = c(NA, -100L), class = "data.frame")
What I'd like to do ideally is have the payouts within a data.frame but not sure how to reference it properly:
job_payouts <- data.frame(a = job_a, b = job_b, c = job_c)
# Won't work...
df %>% mutate(payout = job_payouts$job_code[result])
Upvotes: 3
Views: 652
Reputation: 38510
This can be achieved through the super cool method of matrix indexing in base R which is extremely fast and efficient.
# build jobs payout lookup matrix, by hand (see edit below for an extension)
jobs <- rbind(job_a, job_b, job_c)
# add row names to the matrix for convenient reference
rownames(jobs) <- levels(df$job_code)
# get payout using matrix indexing
df$payout <- jobs[cbind(df$job_code, df$result)]
This returns
# print out first 6 observations
head(df)
emp_id job_code result payout
1 493735 a 3 1000
2 501836 a 1 0
3 491644 b 2 200
4 515953 a 2 500
5 503295 a 2 500
6 491795 b 4 750
# print out jobs matrix for comparison
jobs
[,1] [,2] [,3] [,4]
a 0 500 1000 5000
b 0 200 500 750
c 0 250 750 1000
There are a couple of details that worth mentioning.
data.frame
function converts the job_code character vector, so that df$job_code
is a factor variable where labels are associated with the natural numbers 1, 2, 3, ... By default, levels of the factor are ordered alphabetically by label so, in this example, the label "a" corresponds to 1, "b" to 2, and "c" to 3. You can use the levels
function to find the order of the factor variable and construct the jobs matrix following that template.cbind(df$job_code, df$result)
forms a 2 by nrow(df)
(100) matrix which is used to look up the nrow(df)
payoff values for each employee from the jobs matrix using matrix indexing. The R intro manual has a nice intro section on matrix indexing and additional details can be found in help("[")
.Edit: Automating the construction of the lookup matrix
In the comments to this answer, the OP expresses concern that building the lookup matrix (which I called "jobs"), by hand would be tedious and prone to error. To address these valid concerns, we can use a somewhat obscure argument to the mget
function, "ifnotfound." This argument allows us to control the output of elements of the list that mget
returns when they are present in the vector of names, but not present in the environment.
In the comments, I suggested using NA
to fill in missing levels in the comment below. We can extend this by using NA
as the input for "ifnotfound."
Suppose df$job_code
is a factor that has levels "a", "aa", "b", and "c" in that order. Then we build the look up matrix as follows:
# build vector for example, the actual code, using levels(), follows as a comment
job_codes <- c("a", "aa", "b", "c") # job_codes <- levels(df$jobcodes)
# get ordered list of payouts, with NA for missing payouts
payoutList <- mget(paste0("job_", job_codes), ifnotfound=NA)
which returns a named list.
payoutList
$job_a
[1] 0 500 1000 5000
$job_aa
[1] NA
$job_b
[1] 0 200 500 750
$job_c
[1] 0 250 750 1000
Note that payoutList$job_aa
is a single NA. Now, build the matrix from this list.
# build lookup matrix using do.call() and rbind()
jobs.lookupMat <- do.call(rbind, payoutList)
jobs.lookupMat
[,1] [,2] [,3] [,4]
job_a 0 500 1000 5000
job_aa NA NA NA NA
job_b 0 200 500 750
job_c 0 250 750 1000
The rows of the matrix are properly ordered according to the levels of the factor df$job_code
, conveniently named, and NA
s fill in rows wherever there is no payout.
Upvotes: 10
Reputation: 2022
Using tools from tidyverse:
library(dplyr)
library(stringr)
library(tidyr)
# your data
set.seed(1)
emp_id <- round(rnorm(100, 500000, 10000))
job_code <- sample(c('a', 'b', 'c'), 100, replace = TRUE)
result <- sample(c(1,2,3,4), 100, replace = TRUE)
# construct a data frame
df <-
data.frame(emp_id = emp_id,
job_code = job_code,
result = result,
stringsAsFactors = FALSE)
# your jobs
job_a <- c(0, 500, 1000, 5000)
job_b <- c(0, 200, 500, 750)
job_c <- c(0, 250, 750, 1000)
# construct a data frame
my_job <-
data.frame(job_a, job_b, job_c) %>%
gather(job, value) %>%
group_by(job) %>%
mutate(result = 1:n(),
job_code = str_replace(job, "job_", "")) %>%
ungroup %>%
select(-job)
# join df and my_job into my_results table
my_results <-
left_join(df, my_job)
Results:
my_results %>% tbl_df
Source: local data frame [100 x 4]
emp_id job_code result value
(dbl) (chr) (dbl) (dbl)
1 493735 a 3 1000
2 501836 a 1 0
3 491644 b 2 200
4 515953 a 2 500
5 503295 a 2 500
6 491795 b 4 750
7 504874 b 1 0
8 507383 a 4 5000
9 505758 a 1 0
10 496946 c 2 250
.. ... ... ... ...
Upvotes: 3
Reputation: 6372
Without changing your data structure, you can do this by defining a function:
job_search <- function(code){
var_name <- paste0("job_",code)
if (exists(var_name)){
return(get(var_name))
}else{
return(NA)
}
}
library(data.table)
setDT(df)
df[, payout := job_search(job_code)[result], by = .(emp_id)]
df
emp_id job_code result payout
1: 493735 a 3 1000
2: 501836 a 1 0
3: 491644 b 2 200
4: 515953 a 2 500
5: 503295 a 2 500
6: 491795 b 4 750
7: 504874 b 1 0
8: 507383 a 4 5000
9: 505758 a 1 0
10: 496946 c 2 250
11: 515118 c 1 0
12: 503898 a 1 0
...
However, this is a fairly unstable way to keep your data, and the paste + get syntax is convoluted.
A better way to store your data would be in a lookup table:
library(data.table)
job_a <- data.frame(payout = c(0, 500, 1000, 5000))
job_b <- data.frame(payout = c(0, 200, 500, 750))
job_c <- data.frame(payout = c(0, 250, 750, 1000))
job_lookup <- rbindlist( #this is a data.table
l = list(a = job_a,b = job_b,c = job_c),
idcol = TRUE
)
# create your result index
job_lookup[, result := 1:.N, by = .id]
job_lookup
.id payout result
1: a 0 1
2: a 500 2
3: a 1000 3
4: a 5000 4
5: b 0 1
6: b 200 2
7: b 500 3
8: b 750 4
9: c 0 1
10: c 250 2
11: c 750 3
12: c 1000 4
# merge to your initial data.frame
merge(df, job_lookup, by.x = c("job_code","result"), by.y = c(".id","result"), all.x = TRUE)
job_code result emp_id payout
1 a 1 505758 0
2 a 1 501836 0
3 a 1 503898 0
4 a 1 494575 0
5 a 1 487464 0
6 a 1 503700 0
7 a 1 505939 0
8 a 1 503330 0
9 a 1 512079 0
10 a 1 481950 0
11 a 1 507685 0
12 a 1 490659 0
...
Upvotes: 3