Reputation: 69
library(data.table)
dt <- fread(" ID DATE
A1 20170220
A1 20170308
A1 20170311
A1 20170410
A1 20170411
A1 20170413
A1 20170415
A1 20170416
A1 20170420
A1 20170430
A2 20170120
A2 20170121
A2 20170123
A2 20170125
A2 20170202 ")
and trying to count N like this :
ID DATE count30day(count rows until after 30day)
A1 20170220 3 (count row until 20170322)
A1 20170308 2 (count row until 20170407)
A1 20170311 2 (count row until 20170410)
A1 20170410 7 (count row until 20170510)
A1 20170411 6 (count row until 20170511)
A1 20170413 5 (count row until 20170513)
A1 20170415 4 (count row until 20170514)
A1 20170416 3 (count row until 20170516)
A1 20170420 2 (count row until 20170520)
A1 20170430 1 (count row until 20170530)
A2 20170120 5 (count row until 20170220)
A2 20170121 4 (count row until 20170220)
A2 20170123 3 (count row until 20170220)
A2 20170125 2 (count row until 20170220)
A2 20170202 1 (count row until 20170220)
I tried this
dt[,N:=sapply(DATE, function(x) nrow(dt[x<=DATE&DATE < (x + months(1))]))]
It was work but last 5 value is wrong. It is gotta be 54321 but result was 55432.
and actuall data that I handling is about 2500000 rows so it takes so long time
is anyway that can reduce time and fix last value problem?
Upvotes: 1
Views: 114
Reputation: 389265
I think we need to keep an additional check on the current row number.
Using data.table:
library(data.table)
library(lubridate)
dt[, DATE := ymd(DATE) # convert 'DATE' to Date format by reference
][, row := .I # Add row number using inbuilt var '.I' by reference
][ , N := mapply(function(x, y)
sum(x <= DATE & DATE < (x + months(1)) & y <= row), DATE, row)]
OR using tidyverse
:
library(tidyverse)
library(lubridate)
dt %>%
mutate(DATE = ymd(DATE),
row = row_number(),
N = map2_dbl(DATE, row,
~ sum(.x <= DATE & DATE < (.x + months(1)) & .y <= row))) %>%
select(-row)
# ID DATE N
#1 A1 2017-02-20 3
#2 A1 2017-03-08 2
#3 A1 2017-03-11 2
#4 A1 2017-04-10 7
#5 A1 2017-04-11 6
#6 A1 2017-04-13 5
#7 A1 2017-04-15 4
#8 A1 2017-04-16 3
#9 A1 2017-04-20 2
#10 A1 2017-04-30 1
#11 A2 2017-01-20 5
#12 A2 2017-01-21 4
#13 A2 2017-01-23 3
#14 A2 2017-01-25 2
#15 A2 2017-02-02 1
Upvotes: 1
Reputation: 25223
using non-equi self-join:
dt[, N :=
dt[.(ID=ID, stt=DATE, end=DATE+30), on=.(ID, DATE>=stt, DATE<=end), .N, by=.EACHI]$N
]
output:
ID DATE N
1: A1 2017-02-20 3
2: A1 2017-03-08 2
3: A1 2017-03-11 2
4: A1 2017-04-10 7
5: A1 2017-04-11 6
6: A1 2017-04-13 5
7: A1 2017-04-15 4
8: A1 2017-04-16 3
9: A1 2017-04-20 2
10: A1 2017-04-30 1
11: A2 2017-01-20 5
12: A2 2017-01-21 4
13: A2 2017-01-23 3
14: A2 2017-01-25 2
15: A2 2017-02-02 1
data:
library(data.table)
dt <- fread(" ID DATE
A1 20170220
A1 20170308
A1 20170311
A1 20170410
A1 20170411
A1 20170413
A1 20170415
A1 20170416
A1 20170420
A1 20170430
A2 20170120
A2 20170121
A2 20170123
A2 20170125
A2 20170202 ")
dt[, DATE := as.Date(as.character(DATE), "%Y%m%d")]
Upvotes: 2
Reputation: 27792
Another data.table solution
#set strings to actual dates
dt[, DATE := lubridate::ymd( DATE ) ]
#set key for the join
setkey(dt, DATE)
#join, suspend output until we calculated the number of 'hits' per row (.EACHI)
dt[dt, N := {
val = dt[ ID == i.ID & DATE %between% c( i.DATE, i.DATE + 30 ) ];
list( nrow( val ) )
}, by = .EACHI]
output
# ID DATE N
# 1: A2 2017-01-20 5
# 2: A2 2017-01-21 4
# 3: A2 2017-01-23 3
# 4: A2 2017-01-25 2
# 5: A2 2017-02-02 1
# 6: A1 2017-02-20 3
# 7: A1 2017-03-08 2
# 8: A1 2017-03-11 2
# 9: A1 2017-04-10 7
# 10: A1 2017-04-11 6
# 11: A1 2017-04-13 5
# 12: A1 2017-04-15 4
# 13: A1 2017-04-16 3
# 14: A1 2017-04-20 2
# 15: A1 2017-04-30 1
benchmarks
# Unit: milliseconds
# expr min lq mean median uq max neval
# data.table_wimpel 10.51381 10.73975 11.41636 11.32511 11.89540 13.31526 10
# data.table_ronak 25.42636 25.56223 27.39190 26.46919 29.55910 32.10598 10
# tidyverse_ronak 28.09526 28.73364 30.30307 28.98098 29.45968 38.50784 10
microbenchmark::microbenchmark(
data.table_wimpel = {
dt = copy(DT)
dt[, DATE := lubridate::ymd( DATE ) ]
setkey(dt, DATE)
dt[dt, N := {
val = dt[ ID == i.ID & DATE %between% c( i.DATE, i.DATE + 30 ) ];
list( nrow( val ) )
}, by = .EACHI ] },
data.table_ronak = {
dt = copy(DT)
dt$DATE <- ymd(dt$DATE) #Convert to date
dt$row <- 1:nrow(dt) #Add row number
dt[ , N:= mapply(function(x, y)
sum(x <= DATE & DATE < (x + months(1)) & y <= row), DATE, row)]
},
tidyverse_ronak = {
dt = copy(DT)
dt %>%
mutate(DATE = ymd(DATE),
row = row_number(),
N = map2_dbl(DATE, row,
~ sum(.x <= DATE & DATE < (.x + months(1)) & .y <= row))) %>%
select(-row)
},
times = 10 )
Upvotes: 2