Reputation: 4366
I have a data set for multiple Dates. I want to lag the value of Cells
by one month. I probably cannot use shift()
because each month has different number of days (not to mention there are also some missing dates).
What I do is create a new data table with the unique Year
and Month
, shift/lag the Cells
, and then merge it with the original data table (being careful not to have duplicate columns).
This is not efficient, obviously. Is there an other way to do it?
sapply(c('data.table', 'lubridate'), require, character.only = TRUE)
DT <- fread('DATE, ID, Cells
2000-01-01, 1, 10
2000-01-02, 1, 10
2000-01-03, 1, 10
2000-01-01, 2, 20
2000-01-02, 2, 20
2000-01-03, 2, 20
2000-01-04, 2, 20
2000-02-01, 1, 30
2000-02-02, 1, 30
2000-02-01, 2, 40
2000-02-03, 2, 40
2000-02-04, 2, 40
2000-03-01, 1, 50
2000-03-02, 1, 50
2000-03-01, 2, 60
2000-03-03, 2, 60
')
DT[, date := as.Date(DATE, format = '%Y-%m-%d')][,
c('Year', 'Month') := .(year(date), month(date))]
setkey(DT, Year, Month, ID)
DT.Months <- DT[which(!duplicated(DT))][,
.(Year, Month, ID, Cells)]
DT.Months[, `:=`(Lagged.Cells =
shift(Cells, 1L, type = 'lag')), by = .(ID)]
DT <- DT[DT.Months][, `:=`(i.Cells, NULL)]
# > DT # This is what I want.
# The Value in Cells is lagged by one month,
# regardless of the number of observations within a month for each ID.
# DATE ID Cells date Year Month Lagged.Cells
# 1: 2000-01-01 1 10 2000-01-01 2000 1 NA
# 2: 2000-01-02 1 10 2000-01-02 2000 1 NA
# 3: 2000-01-03 1 10 2000-01-03 2000 1 NA
# 4: 2000-01-01 2 20 2000-01-01 2000 1 NA
# 5: 2000-01-02 2 20 2000-01-02 2000 1 NA
# 6: 2000-01-03 2 20 2000-01-03 2000 1 NA
# 7: 2000-01-04 2 20 2000-01-04 2000 1 NA
# 8: 2000-02-01 1 30 2000-02-01 2000 2 10
# 9: 2000-02-02 1 30 2000-02-02 2000 2 10
#10: 2000-02-01 2 40 2000-02-01 2000 2 10
#11: 2000-02-03 2 40 2000-02-03 2000 2 20
#12: 2000-02-04 2 40 2000-02-04 2000 2 20
#13: 2000-03-01 1 50 2000-03-01 2000 3 20
#14: 2000-03-02 1 50 2000-03-02 2000 3 20
#15: 2000-03-01 2 60 2000-03-01 2000 3 30
#16: 2000-03-03 2 60 2000-03-03 2000 3 30
Upvotes: 2
Views: 2321
Reputation: 23200
# Replace your sapply usage with pacman and you'll thank me
# pacman installs if needed, loads, and doesn't require quotation marks
pacman::p_load(data.table, lubridate)
DT <- fread('DATE, ID, Cells
2000-01-01, 1, 10
2000-01-02, 1, 10
2000-01-03, 1, 10
2000-01-01, 2, 20
2000-01-02, 2, 20
2000-01-03, 2, 20
2000-01-04, 2, 20
2000-02-01, 1, 30
2000-02-02, 1, 30
2000-02-01, 2, 40
2000-02-03, 2, 40
2000-02-04, 2, 40
2000-03-01, 1, 50
2000-03-02, 1, 50
2000-03-01, 2, 60
2000-03-03, 2, 60
')
DT$date <- ymd(DT$DATE)
DT$month <- format((DT$date), "%b")
lag.cells <- as.vector(capture.output(cat(rep("NA", length(DT$month[DT$month == "Jan"])), DT$Cells)))
lag.cells <- strsplit(lag.cells, "\\s+")[[1]]
lag.cells <- lag.cells[1:nrow(DT)]
DT$lag.cells <- lag.cells
DT
DATE ID Cells date month lag.cells
1: 2000-01-01 1 10 2000-01-01 Jan NA
2: 2000-01-02 1 10 2000-01-02 Jan NA
3: 2000-01-03 1 10 2000-01-03 Jan NA
4: 2000-01-01 2 20 2000-01-01 Jan NA
5: 2000-01-02 2 20 2000-01-02 Jan NA
6: 2000-01-03 2 20 2000-01-03 Jan NA
7: 2000-01-04 2 20 2000-01-04 Jan NA
8: 2000-02-01 1 30 2000-02-01 Feb 10
9: 2000-02-02 1 30 2000-02-02 Feb 10
10: 2000-02-01 2 40 2000-02-01 Feb 10
11: 2000-02-03 2 40 2000-02-03 Feb 20
12: 2000-02-04 2 40 2000-02-04 Feb 20
13: 2000-03-01 1 50 2000-03-01 Mar 20
14: 2000-03-02 1 50 2000-03-02 Mar 20
15: 2000-03-01 2 60 2000-03-01 Mar 30
16: 2000-03-03 2 60 2000-03-03 Mar 30
Upvotes: 2
Reputation: 6740
Date
class supports seq
by "month"
, "quarter"
, "year"
, etc.
Not so elegant but you can do something like this.
library(magrittr)
DT[, DATE := as.Date(DATE)]
DT[, DATE_lag := sapply(DATE, function(x)
seq(x, by = "1 month", length.out = 2)[2]) %>%
as.Date(origin = "1970-01-01")]
DT2 <- DT[, .(DATE_lag, ID, Cells)]
setnames(DT2, c("DATE_lag", "Cells"), c("DATE", "Lagged.Cells"))
merge(DT, DT2, by = c("DATE", "ID"), all.x = TRUE)
DATE ID Cells date month lag.cells DATE_lag Lagged.Cells
1: 2000-01-01 1 10 2000-01-01 Jan NA 2000-02-01 NA
2: 2000-01-01 2 20 2000-01-01 Jan NA 2000-02-01 NA
3: 2000-01-02 1 10 2000-01-02 Jan NA 2000-02-02 NA
4: 2000-01-02 2 20 2000-01-02 Jan NA 2000-02-02 NA
5: 2000-01-03 1 10 2000-01-03 Jan NA 2000-02-03 NA
6: 2000-01-03 2 20 2000-01-03 Jan NA 2000-02-03 NA
7: 2000-01-04 2 20 2000-01-04 Jan NA 2000-02-04 NA
8: 2000-02-01 1 30 2000-02-01 Feb 10 2000-03-01 10
9: 2000-02-01 2 40 2000-02-01 Feb 10 2000-03-01 20
10: 2000-02-02 1 30 2000-02-02 Feb 10 2000-03-02 10
11: 2000-02-03 2 40 2000-02-03 Feb 20 2000-03-03 20
12: 2000-02-04 2 40 2000-02-04 Feb 20 2000-03-04 20
13: 2000-03-01 1 50 2000-03-01 Mar 20 2000-04-01 30
14: 2000-03-01 2 60 2000-03-01 Mar 30 2000-04-01 40
15: 2000-03-02 1 50 2000-03-02 Mar 20 2000-04-02 30
16: 2000-03-03 2 60 2000-03-03 Mar 30 2000-04-03 40
>
Upvotes: 1