jjulip
jjulip

Reputation: 1133

How to calculate means for increasing time windows in R

I have a dataframe with a number of different animals (a,b,c in below example data), transactionIDs, counts, and days. I would like to calculate the mean and standard deviation of count values for increasing time windows (specified by days) for each transactionID within each animal. i.e. for transactionID 1 of animal a, I would like to add columns for the means and SDs of i) days -1 to -2, ii) days -1 to -3, iii) days -1 to -4 and so on… so that I end up with 5 new columns with means of increasing time windows and 5 for SDs.

Example data:

> dput(df)
structure(list(Animal = c("a", "a", "a", "a", "a", "a", "a", "a", 
"a", "a", "a", "b", "b", "b", "b", "b", "b", "b", "c", "c", "c", 
"c", "c"), Count = c(45L, 54L, 22L, 3L, 23L, 46L, 45L, 22L, 67L, 
34L, 22L, 34L, 677L, 86L, 54L, 4L, 56L, 98L, 23L, 54L, 22L, 77L, 
23L), Day = c(-6L, -5L, -4L, -3L, -2L, -1L, -5L, -4L, -3L, -2L, 
-1L, -4L, -3L, -2L, -1L, -3L, -2L, -1L, -6L, -5L, -3L, -2L, -1L
), transactionID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L)), .Names = c("Animal", 
"Count", "Day", "transactionID"), class = "data.frame", row.names = c(NA, 
-23L))

> df
   Animal Count Day transactionID
1     a    45  -6             1
2     a    54  -5             1
3     a    22  -4             1
4     a     3  -3             1
5     a    23  -2             1
6     a    46  -1             1
7     a    45  -5             2
8     a    22  -4             2
9     a    67  -3             2
10    a    34  -2             2
11    a    22  -1             2
12    b    34  -4             3
13    b   677  -3             3
14    b    86  -2             3
15    b    54  -1             3
16    b     4  -3             4
17    b    56  -2             4
18    b    98  -1             4
19    c    23  -6             5
20    c    54  -5             5
21    c    22  -3             5
22    c    77  -2             5
23    c    23  -1             5

I can achieve my desired output using the code below. However, when cycling thorough my entire dataframe there are cases where I have fewer than 6 days for an animal, and this for loop does not add NAs in cases where the maximum no. days in the dataset (and thus the time window) is less than 6. I also have a few cases in my dataset where there is a day missing in the day column (i.e. animal c, day -4). In this case I want to add NAs for the means and sds for all time windows from the missing day onwards. See below for my desired output.

My attempt:

#create empty matrix
res2 = as.data.frame(matrix(NA,0,14))
#split by name
animal.list = split(df,df$Name)

#For loop for 
for(i in 1:length(animal.list)){
  a = as.data.frame(animal.list[[i]])
  animal = unique(a$Name)
  #create empty matrix
  res = as.data.frame(matrix(NA,0,14))

  #create list of event IDs
  event = split(a,a$transactionID)

  #loop through each event in turn and calculate the mean of different baseline periods (from 2 days to 6 days)- clunky!
  for(j in 1:length(event)){
    e = as.data.frame(event[[j]])
    #max day
    e$maxday = unique(e[1,]$Day)
    #save mean activity value for the 2 days prior to event
    e$mean2d = round(mean(e[e$Day >-3,]$Count),3)
    e$SD2d = round(sd(e[e$Day >-3,]$Count),3)
    #save mean activity value for the 3 days prior to event
    e$mean3d = round(mean(e[e$Day >-4,]$Count),3)
    e$SD3d = round(sd(e[e$Day >-4,]$Count),3)
    #save mean activity value for the 4 days prior to event
    e$mean4d = round(mean(e[e$Day >-5,]$Count),3)
    e$SD4d = round(sd(e[e$Day >-5,]$Count),3)
    #save mean activity value for the 5 days prior to event
    e$mean5d = round(mean(e[e$Day >-6,]$Count),3)
    e$SD5d = round(sd(e[e$Day >-6,]$Count),3)
    #save mean activity value for the 6 days prior to event
    e$mean6d = round(mean(e[e$Day >-7,]$Count),3)
    e$SD6d = round(sd(e[e$Day >-7,]$Count),3)
    res = rbind(res,e)
  }
  res2 = rbind(res2,res) 
}

Desired output:

>res2
   Name Count Day transactionID maxday mean2d   SD2d  mean3d    SD3d mean4d    SD4d mean5d   SD5d
1     a    45  -6             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
2     a    54  -5             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
3     a    22  -4             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
4     a     3  -3             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
5     a    23  -2             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
6     a    46  -1             1     -6   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452
7     a    45  -5             2     -5   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828
8     a    22  -4             2     -5   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828
9     a    67  -3             2     -5   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828
10    a    34  -2             2     -5   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828
11    a    22  -1             2     -5   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828
12    b    34  -4             3     -4   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA
13    b   677  -3             3     -4   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA
14    b    86  -2             3     -4   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA
15    b    54  -1             3     -4   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA
16    b     4  -3             4     -3   77.0 29.698  52.667  47.089     NA      NA     NA     NA
17    b    56  -2             4     -3   77.0 29.698  52.667  47.089     NA      NA     NA     NA
18    b    98  -1             4     -3   77.0 29.698  52.667  47.089     NA      NA     NA     NA
19    c    23  -6             5     -6   50.0 38.184      NA      NA     NA      NA     NA     NA
20    c    54  -5             5     -6   50.0 38.184      NA      NA     NA      NA     NA     NA
21    c    22  -3             5     -6   50.0 38.184      NA      NA     NA      NA     NA     NA
22    c    77  -2             5     -6   50.0 38.184      NA      NA     NA      NA     NA     NA
23    c    23  -1             5     -6   50.0 38.184      NA      NA     NA      NA     NA     NA
   mean6d   SD6d
1  32.167 19.343
2  32.167 19.343
3  32.167 19.343
4  32.167 19.343
5  32.167 19.343
6  32.167 19.343
7      NA     NA
8      NA     NA
9      NA     NA
10     NA     NA
11     NA     NA
12     NA     NA
13     NA     NA
14     NA     NA
15     NA     NA
16     NA     NA
17     NA     NA
18     NA     NA
19     NA     NA
20     NA     NA
21     NA     NA
22     NA     NA
23     NA     NA

Edit: based on @Henrik's suggestion (this is a much quicker way to calculate the cumulative means and ads but still does not account for cases where there is a missing day and use Has in these cases) - any simple suggestions would be much appreciated):

library(dplyr)
library(TTR)

#create empty matrix
res2 = as.data.frame(matrix(NA,0,14))
#split by name
animal.list = split(df,df$Name)

#For loop for 
for(i in 1:length(animal.list)){
  a = as.data.frame(animal.list[[i]])
  animal = unique(a$Name)
  #create empty matrix
  res = as.data.frame(matrix(NA,0,14))
  #create list of event IDs
  event = split(a,a$transactionID)

  #loop through each event in turn and calculate the mean of different baseline periods (from 2 days prior to 10 days prior)
  for(j in 1:length(event)){
    e = as.data.frame(event[[j]])
    #max day
    e$maxday = unique(e[1,]$Day)
    cmean = cummean(rev(e$Count))
    csd= runSD(rev(e$Count),n=1,cumulative=TRUE)
    e$mean2d = cmean[2]
    e$sd2d = csd[2]
    e$mean3d = cmean[3]
    e$sd3d = csd[3]
    e$mean4d = cmean[4]
    e$sd4d = csd[4]
    e$mean5d = cmean[5]
    e$sd5d = csd[5]
    e$mean6d = cmean[6]
    e$sd6d = csd[6]
    res = rbind(res,e)
  }
  res2 = rbind(res2,res) 
}

Upvotes: 2

Views: 129

Answers (2)

Parfait
Parfait

Reputation: 107652

Consider by to build a list of data frames with those rolling mean and sd calculations. Then row bind all data frame elements from list with do.call.

df_list <- by(df, df[c("Animal", "transactionID")], function(sub)
  transform(sub,
            max_day = min(sub$Day),
            mean = sapply(sub$Day, function(i) mean(sub[sub$Day >= i,]$Count)),
            sd = sapply(sub$Day, function(i) sd(sub[sub$Day >= i,]$Count))
  )
)
# BIND ALL DF ELEMENTS INTO ONE (FILTERING OUT NULL ELEMENTS) 
newdf <- do.call(rbind, Filter(NROW, df_list))

newdf
#    Animal Count Day transactionID max_day      mean         sd
# 1       a    45  -6             1      -6  32.16667  19.343388
# 2       a    54  -5             1      -6  29.60000  20.452384
# 3       a    22  -4             1      -6  23.50000  17.597348
# 4       a     3  -3             1      -6  24.00000  21.517435
# 5       a    23  -2             1      -6  34.50000  16.263456
# 6       a    46  -1             1      -6  46.00000         NA
# 7       a    45  -5             2      -5  38.00000  18.828170
# 8       a    22  -4             2      -5  36.25000  21.266170
# 9       a    67  -3             2      -5  41.00000  23.302360
# 10      a    34  -2             2      -5  28.00000   8.485281
# 11      a    22  -1             2      -5  22.00000         NA
# 12      b    34  -4             3      -4 212.75000 310.240095
# 13      b   677  -3             3      -4 272.33333 350.816666
# 14      b    86  -2             3      -4  70.00000  22.627417
# 15      b    54  -1             3      -4  54.00000         NA
# 16      b     4  -3             4      -3  52.66667  47.088569
# 17      b    56  -2             4      -3  77.00000  29.698485
# 18      b    98  -1             4      -3  98.00000         NA
# 19      c    23  -6             5      -6  39.80000  24.833445
# 20      c    54  -5             5      -6  44.00000  26.545558
# 21      c    22  -3             5      -6  40.66667  31.469562
# 22      c    77  -2             5      -6  50.00000  38.183766
# 23      c    23  -1             5      -6  23.00000         NA

Ideally, you would use above as the final output since it is in long format as many data analytical methods require this shape. However, if you really need the wide format, add a reshape and merge after building above:

rdf <- reshape(newdf, idvar = c("Animal", "transactionID", "max_day"), drop = c("Count"),
               v.names = c("sd", "mean"), timevar = "Day", direction="wide")
# MERGE ORIGINAL DATA FRAME
rdf <- merge(df, rdf, by=c("Animal", "transactionID"))

# CLEAN UP AND REORDER COLUMNS
names(rdf) <- gsub(".-", "_d", names(rdf))
rdf <- rdf[,c(1:5, rev(6:ncol(rdf)))]

rdf        
#    Animal transactionID Count Day max_day mean_d1 sd_d1 mean_d2     sd_d2   mean_d3     sd_d3 mean_d4     sd_d4 mean_d5    sd_d5  mean_d6    sd_d6
# 1       a             1    45  -6      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 2       a             1    54  -5      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 3       a             1    22  -4      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 4       a             1     3  -3      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 5       a             1    23  -2      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 6       a             1    46  -1      -6      46    NA    34.5 16.263456  24.00000  21.51743   23.50  17.59735    29.6 20.45238 32.16667 19.34339
# 7       a             2    45  -5      -5      22    NA    28.0  8.485281  41.00000  23.30236   36.25  21.26617    38.0 18.82817       NA       NA
# 8       a             2    22  -4      -5      22    NA    28.0  8.485281  41.00000  23.30236   36.25  21.26617    38.0 18.82817       NA       NA
# 9       a             2    67  -3      -5      22    NA    28.0  8.485281  41.00000  23.30236   36.25  21.26617    38.0 18.82817       NA       NA
# 10      a             2    34  -2      -5      22    NA    28.0  8.485281  41.00000  23.30236   36.25  21.26617    38.0 18.82817       NA       NA
# 11      a             2    22  -1      -5      22    NA    28.0  8.485281  41.00000  23.30236   36.25  21.26617    38.0 18.82817       NA       NA
# 12      b             3    34  -4      -4      54    NA    70.0 22.627417 272.33333 350.81667  212.75 310.24010      NA       NA       NA       NA
# 13      b             3   677  -3      -4      54    NA    70.0 22.627417 272.33333 350.81667  212.75 310.24010      NA       NA       NA       NA
# 14      b             3    86  -2      -4      54    NA    70.0 22.627417 272.33333 350.81667  212.75 310.24010      NA       NA       NA       NA
# 15      b             3    54  -1      -4      54    NA    70.0 22.627417 272.33333 350.81667  212.75 310.24010      NA       NA       NA       NA
# 16      b             4     4  -3      -3      98    NA    77.0 29.698485  52.66667  47.08857      NA        NA      NA       NA       NA       NA
# 17      b             4    56  -2      -3      98    NA    77.0 29.698485  52.66667  47.08857      NA        NA      NA       NA       NA       NA
# 18      b             4    98  -1      -3      98    NA    77.0 29.698485  52.66667  47.08857      NA        NA      NA       NA       NA       NA
# 19      c             5    23  -6      -6      23    NA    50.0 38.183766  40.66667  31.46956      NA        NA    44.0 26.54556 39.80000 24.83345
# 20      c             5    54  -5      -6      23    NA    50.0 38.183766  40.66667  31.46956      NA        NA    44.0 26.54556 39.80000 24.83345
# 21      c             5    22  -3      -6      23    NA    50.0 38.183766  40.66667  31.46956      NA        NA    44.0 26.54556 39.80000 24.83345
# 22      c             5    77  -2      -6      23    NA    50.0 38.183766  40.66667  31.46956      NA        NA    44.0 26.54556 39.80000 24.83345
# 23      c             5    23  -1      -6      23    NA    50.0 38.183766  40.66667  31.46956      NA        NA    44.0 26.54556 39.80000 24.83345

Upvotes: 1

arg0naut91
arg0naut91

Reputation: 14764

A convenience data.table / sapply function (split in 2 parts for better readability):

add_mean_sd <- function(df, group_var = c("Animal", "transactionID"), day_var = "Day", count_var = "Count", window_var = 2) {

  require(data.table)

  # Calculate mean for the desired window

  df <- setDT(df)[, paste("mean", window_var, "d", sep = "") := ifelse(last(sapply(get(day_var), function(x) length(get(count_var)[between(get(day_var), x - window_var, x)]))) < window_var |
                                                                         any(sapply(get(day_var), function(x) !all(abs(diff(get(day_var)[between(get(day_var), x - window_var + 1, x)])) == 1))), NA_real_, 
                                                                       last(sapply(get(day_var), function(x) round(mean(get(count_var)[between(get(day_var), x - window_var + 1, x)]),3)))), by = mget(group_var)]

  # Calculate sd for the desired window

  df <- df[, paste("sd", window_var, "d", sep = "") := ifelse(last(sapply(get(day_var), function(x) length(get(count_var)[between(get(day_var), x - window_var, x)]))) < window_var |
                                                                any(sapply(get(day_var), function(x) !all(abs(diff(get(day_var)[between(get(day_var), x - window_var + 1, x)])) == 1))), NA_real_, 
                                                              last(sapply(get(day_var), function(x) round(sd(get(count_var)[between(get(day_var), x - window_var + 1, x)]),3)))), by = mget(group_var)]

  return(df)

}

Which you can use either alone, within magrittr pipeline, or a simple loop if you have many desired windows:

# Alone

df <- add_mean_sd(df) # I've set window 2 as default so no need to specify
df <- add_mean_sd(df, window_var = 3) # etc..

# Magrittr

library(magrittr)

df <- add_mean_sd(df) %>% 
  add_mean_sd(window_var = 3) %>%
  add_mean_sd(window_var = 4) %>%
  add_mean_sd(window_var = 5) %>%
  add_mean_sd(window_var = 6)

# A simple loop (will create columns for all windows at once)

for (i in 2:6) { df <- add_mean_sd(df, window_var = i) }

Output:

df[]

    Animal Count Day transactionID mean2d   sd2d  mean3d    sd3d mean4d    sd4d mean5d   sd5d mean6d   sd6d
 1:      a    45  -6             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 2:      a    54  -5             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 3:      a    22  -4             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 4:      a     3  -3             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 5:      a    23  -2             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 6:      a    46  -1             1   34.5 16.263  24.000  21.517  23.50  17.597   29.6 20.452 32.167 19.343
 7:      a    45  -5             2   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828     NA     NA
 8:      a    22  -4             2   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828     NA     NA
 9:      a    67  -3             2   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828     NA     NA
10:      a    34  -2             2   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828     NA     NA
11:      a    22  -1             2   28.0  8.485  41.000  23.302  36.25  21.266   38.0 18.828     NA     NA
12:      b    34  -4             3   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA     NA     NA
13:      b   677  -3             3   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA     NA     NA
14:      b    86  -2             3   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA     NA     NA
15:      b    54  -1             3   70.0 22.627 272.333 350.817 212.75 310.240     NA     NA     NA     NA
16:      b     4  -3             4   77.0 29.698  52.667  47.089     NA      NA     NA     NA     NA     NA
17:      b    56  -2             4   77.0 29.698  52.667  47.089     NA      NA     NA     NA     NA     NA
18:      b    98  -1             4   77.0 29.698  52.667  47.089     NA      NA     NA     NA     NA     NA
19:      c    23  -6             5   50.0 38.184      NA      NA     NA      NA     NA     NA     NA     NA
20:      c    54  -5             5   50.0 38.184      NA      NA     NA      NA     NA     NA     NA     NA
21:      c    22  -3             5   50.0 38.184      NA      NA     NA      NA     NA     NA     NA     NA
22:      c    77  -2             5   50.0 38.184      NA      NA     NA      NA     NA     NA     NA     NA
23:      c    23  -1             5   50.0 38.184      NA      NA     NA      NA     NA     NA     NA     NA

If you'd like to change the order (e.g. go from first to last), you can change last to first while also changing the parameters of between argument (inverting them and changing the sign).

This means that the mean of the window of your desired size will be taken from first value onwards as expected.

Example:

add_mean_sd <- function(df, group_var = c("Animal", "transactionID"), day_var = "Day", count_var = "Count", window_var = 2) {

  require(data.table)

  # Calculate mean for the desired window

  df <- setDT(df)[, paste("mean", window_var, "d", sep = "") := ifelse(first(sapply(get(day_var), function(x) length(get(count_var)[between(get(day_var), x, x + window_var)]))) < window_var |
                                                                         any(sapply(get(day_var), function(x) !all(abs(diff(get(day_var)[between(get(day_var), x, x + window_var - 1)])) == 1))), NA_real_, 
                                                                       first(sapply(get(day_var), function(x) round(mean(get(count_var)[between(get(day_var), x, x + window_var - 1)]),3)))), by = mget(group_var)]

  # Calculate sd for the desired window

  df <- df[, paste("sd", window_var, "d", sep = "") := ifelse(first(sapply(get(day_var), function(x) length(get(count_var)[between(get(day_var), x, x + window_var)]))) < window_var |
                                                                any(sapply(get(day_var), function(x) !all(abs(diff(get(day_var)[between(get(day_var), x, x + window_var - 1)])) == 1))), NA_real_, 
                                                              first(sapply(get(day_var), function(x) round(sd(get(count_var)[between(get(day_var), x, x + window_var - 1)]),3)))), by = mget(group_var)]

  return(df)

}

Upvotes: 2

Related Questions