Reputation: 356
Given a dataset of > 900,000 rows, of which length(duplicates) = >300,000
, the following loop takes appr 4h to run in R, which is unacceptable.
for(i in duplicates) {
couple_table <- filter(data, pnr == i) # filter patients
min_date <- min(couple_table$date) # determine date of first operation
max_date <- max(couple_table$date) # determine date of second operation
data$first[data$pnr == i & data$date == min_date] <- 1 # assign 1 to column first
data$second[data$pnr == i & data$date == max_date] <- 1 # assign 1 to column second
}
How can I tweak this code to run faster in R? I have had a look at *apply
but I am not familiar with it at all, any ideas?
Dummy data:
data <- data.frame(pnr = c('a43','a4945', 'a43', 'a231', 'a231', 'a6901'),
date = c(as.Date('2011-12-19'), as.Date('2012-09-11'), as.Date('2013-10-01'),
as.Date('2012-05-09'), as.Date('2009-09-10'), as.Date('2015-06-12')))
duplicates <- as.character(data$pnr[duplicated(data$pnr)])
Upvotes: 3
Views: 106
Reputation: 101099
A data.table
option
setDT(data)[
,
`:=`(
first = ifelse(min(date) == date & .N > 1, 1, NA_integer_),
second = ifelse(max(date) == date & .N > 1, 1, NA_integer_)
),
pnr
]
gives
pnr date first second
1: a43 2011-12-19 1 NA
2: a4945 2012-09-11 NA NA
3: a43 2013-10-01 NA 1
4: a231 2012-05-09 NA 1
5: a231 2009-09-10 1 NA
6: a6901 2015-06-12 NA NA
Upvotes: 1
Reputation: 76402
Here is a base R solution with ave
. It uses the trick in akrun's answer, that
NA^0 == 1
(More precisely, that NA^FALSE == NA^0 == 1
)
data$first <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == max(d))))
data$second <- with(data, ave(as.integer(date), pnr, FUN = function(d) NA^(d == min(d))))
data
# pnr date first second
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Upvotes: 3
Reputation: 5232
With data.table
library(data.table)
setDT(data)
data[pnr %in% duplicates, ":="(
Min = (date == min(date)) * 1L,
Max = (date == max(date)) * 1L
), by = pnr
]
data[, c("Min", "Max") := lapply(.SD, function(x) ifelse(x == 0, NA, x)), .SDcols = c("Min", "Max")]
Upvotes: 3
Reputation: 886948
A group by operation would be more faster
library(dplyr)
data %>%
group_by(pnr) %>%
mutate(Min = if(n() > 1) NA^(date != min(date)) else NA,
Max = if(n() > 1) NA^(date != max(date)) else NA) %>%
ungroup
-output
# A tibble: 6 x 4
# pnr date Min Max
# <chr> <date> <dbl> <dbl>
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Similar logic in data.table
would be
library(data.table)
setDT(data)[, c('Min', 'Max') := .(if(.N > 1)
NA^(date != min(date)) else NA, if(.N> 1)
NA^(date != max(date)) else NA), .(pnr)]
Or may use collapse
for faster execution
library(collapse)
data %>%
ftransform(n = fNobs(date, pnr, TRA = 'replace_fill')) %>%
ftransform(Min = NA^(fmin(date, pnr, TRA = "replace_fill") != date | n == 1),
Max = NA^(fmax(date, pnr, TRA = "replace_fill") != date | n == 1), n = NULL )
# pnr date Min Max
#1 a43 2011-12-19 1 NA
#2 a4945 2012-09-11 NA NA
#3 a43 2013-10-01 NA 1
#4 a231 2012-05-09 NA 1
#5 a231 2009-09-10 1 NA
#6 a6901 2015-06-12 NA NA
Or use base R
with duplicated
i1 <- with(data, duplicated(pnr)|duplicated(pnr, fromLast = TRUE))
data$Min <- with(data, i1 & date == ave(date, pnr, FUN = min))
data$Max <- with(data, i1 & date == ave(date, pnr, FUN = max))
Upvotes: 5