stats_noob
stats_noob

Reputation: 5907

R: Creating a Function For Calculating Conditional Probabilities

I am working with the R programming language.

I have the following data - suppose this contains the "exam results" for different students (same ID corresponds to the same student) taken at different times:

id = sample.int(10000, 100000, replace = TRUE)
res = c("PASS", "FAIL")
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)

my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]

      id results date_exam_taken
43894  1    FAIL      2001-06-18
31309  1    FAIL      2001-10-21
1996   1    FAIL      2004-08-21
76256  1    PASS      2004-10-13
14043  1    PASS      2005-05-11
38423  1    FAIL      2006-06-10

I want to answer the following question - based on this data, given a that student failed their 3rd exam, what is the probability that a student will pass their 4th exam and what is the probability that this student will fail their 4th exam?

In other words - given the result of the nth exam, what is the probability of pass/fail their n+1 th exam?

I tried to answer this in the following way:

my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL

third_exam = my_data[which(my_data$exam_number == 3), ]
third_exam = third_exam[which(third_exam$results == "FAIL"), ]
fourth_exam = my_data[which(my_data$exam_number == 4), ]

merged = merge(x = third_exam, y = fourth_exam, by = "id", all = TRUE)
merged = na.omit(merged)

pass = merged[merged$results.x == 'FAIL' & merged$results.y  == "PASS", ]
fail = merged[merged$results.x == 'FAIL' & merged$results.y  == "FAIL", ]

pass_prob = nrow(pass)/(nrow(pass) + nrow(fail))
fail_prob = nrow(fail)/(nrow(pass) + nrow(fail))

I tried to make this into a function for the future:

my_function <- function(current_exam, next_exam, result_of_current_exam)
    
{

my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
    
    c_exam = my_data[which(my_data$exam_number == current_exam), ]
    c_exam = c_exam[which(c_exam$results == result_of_current_exam), ]
    n_exam = my_data[which(my_data$exam_number == next_exam), ]
    
    merged = merge(x = c_exam, y = n_exam, by = "id", all = TRUE)
    merged = na.omit(merged)
    
    pass = merged[merged$results.x == result_of_current_exam & merged$results.y  == "PASS", ]
    fail = merged[merged$results.x == result_of_current_exam & merged$results.y  == "FAIL", ]
    
    pass_prob = nrow(pass)/(nrow(pass) + nrow(fail))
    fail_prob = nrow(fail)/(nrow(pass) + nrow(fail))
    
    return(c(pass_prob, fail_prob))
    
}

Now to call the function - given a student passed the third exam, what are the probabilities of passing and failing the fourth exam?

> my_function("3","4", "PASS")
[1] 0.5126595 0.4873405

Is there a quick way to apply my function (assuming I have written this function correctly) for all these combinations?

Can someone please show me how to do this correctly?

Thanks!

Upvotes: 1

Views: 113

Answers (3)

jblood94
jblood94

Reputation: 16981

Using data.table, we can simply shift the results and aggregate by exam number. Here is a function that will allow you to specify how many tests to look back using the lag argument.

library(data.table)

fExams <- function(dt, lag = 1L) {
  nms <- c("exam_num", paste0("prev", lag:1))
  dt2 <- setorderv(
    dt[
      ,(nms) := c(.(1:.N), lapply(lag:1, function(i) shift(results, i))), id
    ][
      ,.(prob_pass = mean(results == "PASS"), samples = .N),
      # exam_num:prev1
      nms
    ],
    nms
  )
  dt[,(nms) := NULL]
  dt2
}

Get the first 15 rows of the table that looks three exams back.

fExams(dt, 3)[1:15]
#>     exam_num prev3 prev2 prev1 prob_pass samples
#>  1:        1  <NA>  <NA>  <NA> 0.4950495    9999
#>  2:        2  <NA>  <NA>  FAIL 0.5031708    5046
#>  3:        2  <NA>  <NA>  PASS 0.5014141    4950
#>  4:        3  <NA>  FAIL  FAIL 0.5063949    2502
#>  5:        3  <NA>  FAIL  PASS 0.4922894    2529
#>  6:        3  <NA>  PASS  FAIL 0.5002030    2463
#>  7:        3  <NA>  PASS  PASS 0.4915391    2482
#>  8:        4  FAIL  FAIL  FAIL 0.4861338    1226
#>  9:        4  FAIL  FAIL  PASS 0.4904459    1256
#> 10:        4  FAIL  PASS  FAIL 0.5094192    1274
#> 11:        4  FAIL  PASS  PASS 0.5169355    1240
#> 12:        4  PASS  FAIL  FAIL 0.5000000    1222
#> 13:        4  PASS  FAIL  PASS 0.4930271    1219
#> 14:        4  PASS  PASS  FAIL 0.4763432    1247
#> 15:        4  PASS  PASS  PASS 0.5282392    1204

We can see from row 12 that exactly 50% of the 1222 students who passed their first exam but failed their second and third also failed their fourth exam.

For more general queries, here is a function that returns p(y|x), where x is the results of exams to condition on and y is the results of exams of interest.

pExam <- function(dt, x, y) {
  yRes <- c("FAIL", "PASS")[sign(y)/2 + 1.5]
  i <- with(
    rle(
      sort(
        dt[,exam_num := 1:.N, id][
          .(
            exam_num = abs(x),
            results = c("FAIL", "PASS")[sign(x)/2 + 1.5]
          ),
          on = .(exam_num = exam_num, results = results)
        ]$id
      )
    ),
    values[lengths == length(x)]
  )
  mean(dt[id %in% i & exam_num %in% y][,identical(results, yRes), id][[2]])
}

As with the previous example, the proportion of students who passed their fourth exam given that they passed their first exam but failed their second and third exams is:

pExam(dt, c(1, -2, -3), 4)
#> [1] 0.5

Here a negative index in x or y indicates a failed exam.

Data:

set.seed(1238818837)

dt <- setkey(
  data.table(
    id = sample.int(10000, 100000, replace = TRUE),
    results = sample(c("PASS", "FAIL"), 100000, replace = TRUE),
    date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
  ),
  id, date_exam_taken
)

Upvotes: 1

zephryl
zephryl

Reputation: 17079

For your second question, here’s a function that takes multiple conditions, passed as a named vector:

set.seed(13)
library(dplyr)
library(purrr)

conditional_rates <- function(conditions, next_exam) {
  conditioned <- my_data %>%
    group_by(id) %>%
    filter(
      all(map2_lgl(
        as.numeric(names(conditions)),
        conditions,
        ~ any(row_number() == .x & results == .y)
      )),
      row_number() == next_exam
    ) %>%
    ungroup()
    
  p_PASS <- mean(conditioned$results == "PASS")
  
  c(p_PASS = p_PASS, p_FAIL = 1 - p_PASS)
}

# rates for exam 4 conditional on passing exam 3
conditional_rates(c("3" = "PASS"), 4)
#    p_PASS    p_FAIL 
# 0.4908248 0.5091752 

# rates for exam 4 conditional on failing exams 1 
# and 3 and passing exam 2
conditional_rates(
  c("1" = "FAIL", "2" = "PASS", "3" = "FAIL"),
  4
)
#    p_PASS    p_FAIL 
# 0.4946483 0.5053517 

Upvotes: 1

chris jude
chris jude

Reputation: 498

I attempted your first query regarding running the function for all consecutive combinations. Hope this is useful

current<-unique(my_data$exam_number)
next_ex<-current[-1]
current<-current[-length(current)]

library(tidyverse)
pmap(list(.x=current,.y=next_ex),
     ~my_function(current_exam=.x,
                  next_exam=.y,
                  result_of_current_exam="PASS"))

pmap(list(.x=current,.y=next_ex),
     ~my_function(current_exam=.x,
                  next_exam=.y,
                  result_of_current_exam="FAIL"))

Upvotes: 2

Related Questions