Reputation: 225
I have a panel data set with observations for individuals over time. There are three columns: person, date, and value. I want to create three new variables. First: the number of unique entries in the value column for each person-day that occur within 1 year before the observation. Second: a set of dummies that reflect whether or not the person had each possible entry in the value column within the last year. Third: a set of count variables that count the number of times the person had each possible entry in the value column within the previous year.
I have a way to do this on a toy data set, but when I try to apply my method to a large data set, I run out of memory.
Is there a more efficient way to get these three variables?
My toy data set approach has 5 steps:
# In this example, "Species" = "person", "Sepal.Width" = "value"
# I use 3 days instead of 1 year
library(data.table)
library(zoo)
library(stringr)
set.seed(481516)
dt <- as.data.table(iris)
dt[, date := as.Date(seq_len(.N), format = "%Y"), by = Species]
dt[, sepal_width_above_3 := ifelse(Sepal.Width > 3, 1, 0)]
dt[, random_drop := runif(nrow(dt))]
dt <- dt[random_drop >= 0.1]
dt_lag <- dt[, .(Species, date, Sepal.Width)]
#step 1: lag the date - a new var for every species-date combo
dt_lag[, species_count := .N, by = Species]
keep_names <- names(dt_lag)[names(dt_lag) != "Species"]
dt_lag <- cbind(dt_lag[, ..keep_names],
dt_lag[, shift(date, n=1:max(dt_lag$species_count), give.names = T), by = Species],
dt_lag[, shift(Sepal.Width, n=1:max(dt_lag$species_count), give.names = T), by = Species])
#step 2: which values do we keep?
for (X in names(dt_lag)[names(dt_lag) %like% "^date_lag_"]){
Y <- gsub("date","Sepal.Width", X)
dt_lag[, (X) := ifelse(date - get(X) > 0 & date - get(X) <= 3, get(Y) , "OOR")]
dt_lag[, (Y) := NULL]
}
#step 3: paste together
dt_lag[, sepal_width_values_within_lag_3 := gsub('-NA|-OOR','',do.call(paste, c(.SD, sep="-"))), .SDcols = names(dt_lag) %like% "date_lag_"]
for (X in names(dt_lag)[names(dt_lag) %like% "^date_lag_"]){
dt_lag[, (X) := NULL]
}
#step 4: counts and dummies for each type
for (X in sort(unique(dt_lag$Sepal.Width))) {
# X = 1
spec_count = paste("sepal_width_count_lag_365_",X,sep="")
spec_dummy = paste("sepal_width_dummy_lag_365_",X,sep="")
pattern = as.character(X)
dt_lag[, (spec_count) := str_count(sepal_width_values_within_lag_3, eval(pattern))]
dt_lag[, (spec_dummy) := str_detect(sepal_width_values_within_lag_3, eval(pattern))]
}
#step 5: unique counts
dt_lag[, unique_sepal_width_values_within_lag_3_count := sapply(sepal_width_values_within_lag_3, function(x)
length( # count items
unique( # that are unique
scan( # when arguments are presented to scan as text
text=x, what="", sep ="-", # when separated by ","
quiet=TRUE))) )]
Here are the results for 1 entry in the value column (width = 2)
head(dt_lag[,.(date, Species, sepal_width_values_within_lag_3, sepal_width_count_lag_365_2, sepal_width_dummy_lag_365_2, unique_sepal_width_values_within_lag_3_count)])
date Species sepal_width_values_within_lag_3 sepal_width_count_lag_365_2 sepal_width_dummy_lag_365_2 unique_sepal_width_values_within_lag_3_count
1: 1970-09-14 setosa 3.5 0 FALSE 1
2: 1970-09-15 setosa 3-3.5 0 FALSE 2
3: 1970-09-16 setosa 3.2-3-3.5 1 TRUE 3
4: 1970-09-17 setosa 3.1-3.2-3 1 TRUE 3
5: 1970-09-18 setosa 3.6-3.1-3.2 1 TRUE 3
6: 1970-09-19 setosa 3.9-3.6-3.1 0 FALSE 3
Upvotes: 5
Views: 258
Reputation: 27732
This is only a partial answer, since I did not fully understand your 2nd and 3rd problem...
#create data.table with the correct names, based on your sample data (i think)
DT <- dt[, .(person = Species, date, value = Sepal.Width)]
#set keys
setkey(DT, person, date)
#create unique values of `value in the last year before the observation, for each `person
DT[ DT,
#get the unique values for the last year, suppress immediate output with {}
unique_values_prev_year := {
val = DT[ person == i.person & date %between% c( i.date - lubridate::years(1), i.date) ]$value
unique_val = sort( unique( val ) )
list( paste0( unique_val, collapse = "-" ) )
},
#do the above for each row
by = .EACHI ]
output
# person date value unique_values_prev_year
# 1: setosa 1970-09-14 3.5 3.5
# 2: setosa 1970-09-15 3.0 3-3.5
# 3: setosa 1970-09-16 3.2 3-3.2-3.5
# 4: setosa 1970-09-17 3.1 3-3.1-3.2-3.5
# 5: setosa 1970-09-19 3.9 3-3.1-3.2-3.5-3.9
# ---
# 133: virginica 1970-10-28 3.3 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 134: virginica 1970-10-29 3.0 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 135: virginica 1970-10-30 2.5 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 136: virginica 1970-10-31 3.0 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
# 137: virginica 1970-11-01 3.4 2.2-2.5-2.6-2.7-2.8-2.9-3-3.1-3.2-3.3-3.4-3.6-3.8
Upvotes: 4