pha
pha

Reputation: 356

How to rewrite loop to run faster in R?

Given a dataset of > 900,000 rows, of which length(duplicates) = >300,000, the following loop takes appr 4h to run in R, which is unacceptable.

for(i in duplicates) {
  couple_table <- filter(data, pnr == i) # filter patients
  min_date <- min(couple_table$date)     # determine date of first operation
  max_date <- max(couple_table$date)     # determine date of second operation
  
  data$first[data$pnr == i & data$date == min_date] <- 1  # assign 1 to column first
  data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}

How can I tweak this code to run faster in R? I have had a look at *apply but I am not familiar with it at all, any ideas?

Dummy data:

data <- data.frame(pnr = c('a43','a4945', 'a43', 'a231', 'a231', 'a6901'),
                date = c(as.Date('2011-12-19'), as.Date('2012-09-11'),  as.Date('2013-10-01'),
                as.Date('2012-05-09'), as.Date('2009-09-10'), as.Date('2015-06-12')))
duplicates <- as.character(data$pnr[duplicated(data$pnr)])

Upvotes: 3

Views: 106

Answers (4)

ThomasIsCoding
ThomasIsCoding

Reputation: 101099

A data.table option

setDT(data)[
  ,
  `:=`(
    first = ifelse(min(date) == date & .N > 1, 1, NA_integer_),
    second = ifelse(max(date) == date & .N > 1, 1, NA_integer_)
  ),
  pnr
]

gives

     pnr       date first second
1:   a43 2011-12-19     1     NA
2: a4945 2012-09-11    NA     NA
3:   a43 2013-10-01    NA      1
4:  a231 2012-05-09    NA      1
5:  a231 2009-09-10     1     NA
6: a6901 2015-06-12    NA     NA

Upvotes: 1

Rui Barradas
Rui Barradas

Reputation: 76402

Here is a base R solution with ave. It uses the trick in akrun's answer, that

NA^0 == 1

(More precisely, that NA^FALSE == NA^0 == 1)

data$first <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == max(d))))
data$second <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == min(d))))

data
#    pnr       date first second
#1   a43 2011-12-19     1     NA
#2 a4945 2012-09-11    NA     NA
#3   a43 2013-10-01    NA      1
#4  a231 2012-05-09    NA      1
#5  a231 2009-09-10     1     NA
#6 a6901 2015-06-12    NA     NA

Upvotes: 3

det
det

Reputation: 5232

With data.table

library(data.table)

setDT(data)
data[pnr %in% duplicates, ":="(
    Min = (date == min(date)) * 1L,
    Max = (date == max(date)) * 1L
  ), by = pnr
]
data[, c("Min", "Max") := lapply(.SD, function(x) ifelse(x == 0, NA, x)), .SDcols = c("Min", "Max")]

Upvotes: 3

akrun
akrun

Reputation: 886948

A group by operation would be more faster

library(dplyr)
data %>% 
   group_by(pnr) %>% 
   mutate(Min = if(n() > 1)  NA^(date != min(date)) else NA, 
          Max = if(n() > 1) NA^(date != max(date)) else NA) %>%
   ungroup

-output

# A tibble: 6 x 4
#  pnr   date         Min   Max
#  <chr> <date>     <dbl> <dbl>
#1 a43   2011-12-19     1    NA
#2 a4945 2012-09-11    NA    NA
#3 a43   2013-10-01    NA     1
#4 a231  2012-05-09    NA     1
#5 a231  2009-09-10     1    NA
#6 a6901 2015-06-12    NA    NA

Similar logic in data.table would be

library(data.table)
setDT(data)[, c('Min', 'Max') := .(if(.N > 1) 
    NA^(date != min(date)) else NA, if(.N> 1) 
        NA^(date != max(date)) else NA), .(pnr)]

Or may use collapse for faster execution

library(collapse)
data %>%
    ftransform(n = fNobs(date, pnr, TRA = 'replace_fill')) %>% 
    ftransform(Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1), 
    Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1), n = NULL )
#    pnr       date Min Max
#1   a43 2011-12-19   1  NA
#2 a4945 2012-09-11  NA  NA
#3   a43 2013-10-01  NA   1
#4  a231 2012-05-09  NA   1
#5  a231 2009-09-10   1  NA
#6 a6901 2015-06-12  NA  NA

Or use base R with duplicated

i1 <- with(data, duplicated(pnr)|duplicated(pnr, fromLast = TRUE))     
data$Min <- with(data, i1 & date == ave(date, pnr, FUN = min))
data$Max <- with(data, i1 & date == ave(date, pnr, FUN = max))

Upvotes: 5

Related Questions