Reputation: 5907
I am working with the R programming language. I have the following data:
library(dplyr)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE) , var_2 =sample(c(0,1), 15 , replace = TRUE) )
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
The data looks something like this:
id index var_1 var_2
1 1 1 0 1
2 1 2 0 0
3 1 3 1 1
4 1 4 0 1
5 2 1 1 0
6 2 2 1 1
7 2 3 0 1
8 3 1 1 0
9 4 1 0 0
10 4 2 0 0
11 5 1 0 0
12 5 2 1 0
13 5 3 0 1
14 5 4 0 0
15 5 5 0 1
I want to create two new variables (v_1, v_2). For each unique "id":
v_1: I want v_1 to be the average value of the current, previous and previous-to-previous values of var_1 (i.e. index = n, index = n-1 and index = n-2). When this is not possible (e.g. for index = 2 and index = 1), I want this average to be for as back as you can go.
v_2: I want v_2 to be the average value of the current, previous and previous-to-previous values of var_2 (i.e. index = n, index = n-1 and index = n-2). When this is not possible (e.g. for index = 2 and index = 1), I want this average to be for as back as you can go.
This would be something like this:
I tried to do this with the following code:
average_data = my_data %>%
group_by(id) %>%
summarise(v_1 = mean(tail(var_1, 3)),
v_2 = mean(tail(var_2, 3)))
# final_result
final_data = merge(x = my_data, y = average_data, by = "id", all.x = TRUE)
But I am not sure if this is correct.
Can someone please show me how to do this?
Thanks!
Upvotes: 6
Views: 359
Reputation: 7540
This uses dplyr's across
with slider's slide_dbl
; both from the tidyverse. Slider handles partial windows, so is well-suited to this problem.
(%>%
may be used instead of the native pipe |>
.)
library(dplyr)
library(slider)
# Sample Data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
# Possible answer
df |>
group_by(id) |>
mutate(across(starts_with("var_"), ~ slide_dbl(., mean, .before = 2), .names = "{.col}_mean")) |>
ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 var_1_mean var_2_mean
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-12 by the reprex package (v2.0.1)
Upvotes: 0
Reputation: 3212
Here is a solution using only built in functions and dplyr
:
my_data %>%
mutate(
row = seq_along(id),
v_1 = (var_1 + lag(var_1, default = 0) + lag(var_1, 2, default = 0))/pmin(row, 3),
v_2 = (var_2 + lag(var_2, default = 0) + lag(var_2, 2, default = 0))/pmin(row, 3)
)
#> id index var_1 var_2 row v_1 v_2
#> 1 1 1 0 1 1 0.0000000 1.0000000
#> 2 1 2 1 0 2 0.5000000 0.5000000
#> 3 1 3 1 1 3 0.6666667 0.6666667
#> 4 1 4 1 0 4 1.0000000 0.3333333
#> 5 2 1 0 1 5 0.6666667 0.6666667
#> 6 2 2 0 1 6 0.3333333 0.6666667
#> 7 2 3 1 1 7 0.3333333 1.0000000
#> 8 3 1 1 1 8 0.6666667 1.0000000
#> 9 4 1 1 1 9 1.0000000 1.0000000
#> 10 4 2 1 1 10 1.0000000 1.0000000
#> 11 5 1 0 1 11 0.6666667 1.0000000
#> 12 5 2 0 1 12 0.3333333 1.0000000
#> 13 5 3 0 0 13 0.0000000 0.6666667
#> 14 5 4 0 0 14 0.0000000 0.3333333
#> 15 5 5 1 0 15 0.3333333 0.0000000
Created on 2022-06-09 by the reprex package (v2.0.1)
dplyr::lag()
gives you the previuous values of your variable. If they don't exist, we swap them for 0
, basically ignoring them. To get the average we divide on pmax(seq_along(<any variable>, 3))
, which will be 1
for the first row, 2
for the second, and 3
for all other rows.
This will also work on a grouped dataframe.
Upvotes: 2
Reputation: 4425
Here is a try with a simple function avg
to return this type of average
library(dplyr , warn.conflicts = FALSE)
set.seed(1978)
my_data = data.frame(id = c(1,1,1,1,2,2,2,3,4,4,5,5,5,5,5), var_1 = sample(c(0,1), 15, replace = TRUE) , var_2 =sample(c(0,1), 15 , replace = TRUE) )
my_data = data.frame(my_data %>% group_by(id) %>% mutate(index = row_number(id)))
my_data = my_data[,c(1,4,2,3)]
#===================================
avg <- function(x){
t <- rep(c(T,NA) , c(3 , length(x) - 1))
m <- numeric(length(x))
for(i in 1:length(x)){
m [i]<- mean(x[t[3:length(t)]] , na.rm = TRUE)
t <- lag(t)
}
m
}
#===================================
library(tidyverse)
my_data %>%
group_by(id) %>%
mutate(v_1 = avg(var_1), v_2 = avg(var_2))
#> # A tibble: 15 × 6
#> # Groups: id [5]
#> id index var_1 var_2 v_1 v_2
#> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 1 0 0 0 0
#> 2 1 2 1 0 0.5 0
#> 3 1 3 1 0 0.667 0
#> 4 1 4 1 1 1 0.333
#> 5 2 1 0 1 0 1
#> 6 2 2 1 1 0.5 1
#> 7 2 3 0 0 0.333 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 1 1 1 1
#> 10 4 2 0 1 0.5 1
#> 11 5 1 1 1 1 1
#> 12 5 2 1 0 1 0.5
#> 13 5 3 0 1 0.667 0.667
#> 14 5 4 1 0 0.667 0.333
#> 15 5 5 1 0 0.667 0.333
Created on 2022-06-09 by the reprex package (v2.0.1)
Upvotes: 1
Reputation: 101678
I would say this is moving average, and it can be impemented by a function f
like below, using embed
(preferrable) or sapply
(less efficient, not recommanded), and run it group-wisely using ave
:
f <- function(v, n = 3) {
rowMeans(embed(c(rep(NA, n-1), v), n), na.rm = TRUE)
}
or
f <- function(v, n = 3) {
sapply(
seq_along(v),
function(k) sum(v[pmax(k - n + 1, 1):k]) / pmin(k, n)
)
}
and then we run
transform(
df,
v1 = ave(var_1, id, FUN = f),
v2 = ave(var_2, id, FUN = f)
)
such that
id index var_1 var_2 v1 v2
1 1 1 0 1 0.0000000 1.0000000
2 1 2 0 0 0.0000000 0.5000000
3 1 3 1 1 0.3333333 0.6666667
4 1 4 0 1 0.3333333 0.6666667
5 2 1 1 0 1.0000000 0.0000000
6 2 2 1 1 1.0000000 0.5000000
7 2 3 0 1 0.6666667 0.6666667
8 3 1 1 0 1.0000000 0.0000000
9 4 1 0 0 0.0000000 0.0000000
10 4 2 0 0 0.0000000 0.0000000
11 5 1 0 0 0.0000000 0.0000000
12 5 2 1 0 0.5000000 0.0000000
13 5 3 0 1 0.3333333 0.3333333
14 5 4 0 0 0.3333333 0.3333333
15 5 5 0 1 0.0000000 0.6666667
Upvotes: 3
Reputation: 39667
You can use filter
(which is hidden when loading dplyr
) or convolve
and ave
for grouping.
fun <- function(x) {
. <- if(length(x) > 2) stats::filter(x, c(1,1,1)/3, side=1)[-2:-1] else NULL
#. <- if(length(x) > 2) convolve(x, c(1,1,1)/3, , type = "filter") else NULL #Alternative
c(cummean(x[1:min(2, length(x))]), .)
}
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun)
my_data
# id index var_1 var_2 v_1 v_2
#1 1 1 1 1 1.0000000 1.0000000
#2 1 2 1 1 1.0000000 1.0000000
#3 1 3 0 1 0.6666667 1.0000000
#4 1 4 1 1 0.6666667 1.0000000
#5 2 1 0 1 0.0000000 1.0000000
#6 2 2 0 0 0.0000000 0.5000000
#7 2 3 1 0 0.3333333 0.3333333
#8 3 1 0 0 0.0000000 0.0000000
#9 4 1 0 1 0.0000000 1.0000000
#10 4 2 0 0 0.0000000 0.5000000
#11 5 1 1 0 1.0000000 0.0000000
#12 5 2 0 1 0.5000000 0.5000000
#13 5 3 0 0 0.3333333 0.3333333
#14 5 4 1 0 0.3333333 0.3333333
#15 5 5 0 1 0.3333333 0.3333333
Or using cumsum
:
fun2 <- function(x, n=3) {
(cumsum(x) - head(cumsum(c(rep(0, n), x)), -n)) / pmin(n, seq_along(x)) }
my_data$v_1 <- ave(my_data$var_1, my_data$id, FUN=fun2)
my_data$v_2 <- ave(my_data$var_2, my_data$id, FUN=fun2)
Upvotes: 1
Reputation: 8880
data
df <- data.frame(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 3L, 4L, 4L, 5L, 5L, 5L, 5L, 5L),
index = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 1L, 1L, 2L, 1L, 2L, 3L, 4L, 5L),
var_1 = c(0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L),
var_2 = c(1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L)
)
tidyverse
library(tidyverse)
df %>%
group_by(id) %>%
mutate(across(starts_with("var_"),
.fns = ~zoo::rollapply(data = .x, width = 3, FUN = mean, partial = TRUE, align = "right"),
.names = "new_{.col}")) %>%
ungroup()
#> # A tibble: 15 × 6
#> id index var_1 var_2 new_var_1 new_var_2
#> <int> <int> <int> <int> <dbl> <dbl>
#> 1 1 1 0 1 0 1
#> 2 1 2 0 0 0 0.5
#> 3 1 3 1 1 0.333 0.667
#> 4 1 4 0 1 0.333 0.667
#> 5 2 1 1 0 1 0
#> 6 2 2 1 1 1 0.5
#> 7 2 3 0 1 0.667 0.667
#> 8 3 1 1 0 1 0
#> 9 4 1 0 0 0 0
#> 10 4 2 0 0 0 0
#> 11 5 1 0 0 0 0
#> 12 5 2 1 0 0.5 0
#> 13 5 3 0 1 0.333 0.333
#> 14 5 4 0 0 0.333 0.333
#> 15 5 5 0 1 0 0.667
Created on 2022-06-06 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- gsub("ar", "", grep("var_", names(df), value = TRUE))
setDT(df)[,
(COLS) := lapply(.SD, function(x) zoo::rollapply(data = x, width = 3, FUN = mean, partial = TRUE, align = "right")),
by = id,
.SDcols = patterns("var_")][]
#> id index var_1 var_2 v_1 v_2
#> 1: 1 1 0 1 0.0000000 1.0000000
#> 2: 1 2 0 0 0.0000000 0.5000000
#> 3: 1 3 1 1 0.3333333 0.6666667
#> 4: 1 4 0 1 0.3333333 0.6666667
#> 5: 2 1 1 0 1.0000000 0.0000000
#> 6: 2 2 1 1 1.0000000 0.5000000
#> 7: 2 3 0 1 0.6666667 0.6666667
#> 8: 3 1 1 0 1.0000000 0.0000000
#> 9: 4 1 0 0 0.0000000 0.0000000
#> 10: 4 2 0 0 0.0000000 0.0000000
#> 11: 5 1 0 0 0.0000000 0.0000000
#> 12: 5 2 1 0 0.5000000 0.0000000
#> 13: 5 3 0 1 0.3333333 0.3333333
#> 14: 5 4 0 0 0.3333333 0.3333333
#> 15: 5 5 0 1 0.0000000 0.6666667
Created on 2022-06-06 by the reprex package (v2.0.1)
Upvotes: 5
Reputation: 79238
You could create a function that acomplishes this:
library(tidyverse)
fun <- function(x, k){
y <- cummean(first(x, k-1))
if(k > length(x)) y else c(y, zoo::rollmean(x, k))
}
df %>%
group_by(id) %>%
mutate(v_1 = fun(var_1, 3), v_2 = fun(var_2, 3))
# Groups: id [5]
id index var_1 var_2 v_1 v_2
<int> <int> <int> <int> <dbl> <dbl>
1 1 1 0 1 0 1
2 1 2 0 0 0 0.5
3 1 3 1 1 0.333 0.667
4 1 4 0 1 0.333 0.667
5 2 1 1 0 1 0
6 2 2 1 1 1 0.5
7 2 3 0 1 0.667 0.667
8 3 1 1 0 1 0
9 4 1 0 0 0 0
10 4 2 0 0 0 0
11 5 1 0 0 0 0
12 5 2 1 0 0.5 0
13 5 3 0 1 0.333 0.333
14 5 4 0 0 0.333 0.333
15 5 5 0 1 0 0.667
Upvotes: 2