Reputation: 4243
I have a dataset as follows as a sample. My actual dataset has 5000 columns:
# Define Adstock Rate
adstock_rate = 0.50
lag_number = 3
# Create Data
advertising = c(117.913, 120.112, 125.828, 115.354, 177.090, 141.647, 137.892, 0.000, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 158.511, 109.385, 91.084, 79.253, 102.706,
78.494, 135.114, 114.549, 87.337, 107.829, 125.020, 82.956, 60.813, 83.149, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 129.515, 105.486, 111.494, 107.099, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,
134.913, 123.112, 178.828, 112.354, 100.090, 167.647, 177.892, 0.000, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 112.511, 155.385, 123.084, 89.253, 67.706,
23.494, 122.114, 112.549, 65.337, 134.829, 123.020, 81.956, 23.813, 65.149, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 145.515, 154.486, 121.494, 117.099, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
)
advertising2 = c(43.913, 231.112, 76.828, 22.354, 98.090, 123.647, 90.892, 0.000, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 234.511, 143.385, 78.084, 89.253, 12.706,
34.494, 56.114, 78.549, 12.337, 67.829, 42.020, 90.956, 23.813, 83.149, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 52.515, 76.486, 89.494, 12.099, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000,
67.913, 12.112, 45.828, 78.354, 89.090, 90.647, 23.892, 0.000, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 78.511, 23.385, 43.084, 67.253, 33.706,
56.494, 78.114, 98.549, 45.337, 31.829, 67.020, 87.956, 94.813, 65.149, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 55.515, 32.486, 78.494, 33.099, 0.000, 0.000, 0.000,
0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000
)
Region = c(500, 500, 500, 500, 500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,
500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500,500, 500, 500, 500, 500, 500,
500, 500,
501, 501, 501, 501, 501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,
501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501,501, 501, 501, 501, 501, 501,
501, 501)
advertising_dataset<-data.frame(cbind(Region, advertising, advertising2))
My dataset looks like this:
head(advertising_dataset, 15)
Region advertising advertising2
1 500 117.913 43.913
2 500 120.112 231.112
3 500 125.828 76.828
4 500 115.354 22.354
5 500 177.090 98.090
6 500 141.647 123.647
7 500 137.892 90.892
8 500 0.000 0.000
9 500 0.000 0.000
10 500 0.000 0.000
11 500 0.000 0.000
12 500 0.000 0.000
13 500 0.000 0.000
14 500 0.000 0.000
15 500 0.000 0.000
A for-loop is then created to only 1 column and then a group_by function after that by Region
.
foo <- function(df_, lag_val = 1) {
df_$adstocked_advertising = df_$advertising
for (i in (1 + lag_val):nrow(df_)) {
df_$adstocked_advertising[i] = df_$advertising[i] + adstock_rate *
df_$adstocked_advertising[i - lag_val]
}
return(df_)
}
adv_2 <- data.frame(advertising_dataset %>%
group_by(Region) %>%
do(foo(data.frame(.), lag_val = 3)))
How do I apply the above functions including adv_2
to all columns from 2:ncol(advertising_dataset) rather than just the advertising
column?
My final number of columns should double in the end because a newly revised column will be created for every existing column.
I have a feeling it is something along these lines, with the function I above:
data.frame(advertising_dataset[1],
apply(advertising_dataset[2:ncol(advertising_dataset)],2, foo) )
Any help would be great, thanks!
Upvotes: 1
Views: 100
Reputation: 887991
We could use accumulate
with mutate_all
library(tidyverse)
out <- advertising_dataset %>%
group_by(Region) %>%
mutate_all(funs(adstocked = accumulate(., ~ .y + adstock_rate * .x)))
out
# A tibble: 104 x 5
# Groups: Region [2]
# Region advertising advertising2 advertising_adstocked advertising2_adstocked
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 500 118. 43.9 118. 43.9
# 2 500 120. 231. 179. 253.
# 3 500 126. 76.8 215. 203.
# 4 500 115. 22.4 223. 124.
# 5 500 177. 98.1 289. 160.
# 6 500 142. 124. 286. 204.
# 7 500 138. 90.9 281. 193.
# 8 500 0 0 140. 96.4
# 9 500 0 0 70.2 48.2
#10 500 0 0 35.1 24.1
# ... with 94 more rows
Checking with the output from OP's solution
head(out[[4]])
#[1] 117.9130 179.0685 215.3623 223.0351 288.6076 285.9508
head(adv_2[[4]])
#[1] 117.9130 179.0685 215.3623 223.0351 288.6076 285.9508
We could modify the OP's function foo
for different lag_val
foo1 <- function(dot, lag_val = 1) {
tmp <- dot
for(i in (1 + lag_val): length(tmp)) {
tmp[i] <- tmp[i] + adstock_rate * tmp[i - lag_val]
}
return(tmp)
}
advertising_dataset %>%
group_by(Region) %>%
mutate_all(funs(adstocked = foo1(., lag_val = 1)))
# A tibble: 104 x 5
# Groups: Region [2]
# Region advertising advertising2 advertising_adstocked advertising2_adstocked
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 500 118. 43.9 118. 43.9
# 2 500 120. 231. 179. 253.
# 3 500 126. 76.8 215. 203.
# 4 500 115. 22.4 223. 124.
# 5 500 177. 98.1 289. 160.
# 6 500 142. 124. 286. 204.
# 7 500 138. 90.9 281. 193.
# 8 500 0 0 140. 96.4
# 9 500 0 0 70.2 48.2
#10 500 0 0 35.1 24.1
# ... with 94 more rows
-change the lag_val
advertising_dataset %>%
group_by(Region) %>%
mutate_all(funs(adstocked = foo1(., lag_val = 2)))
# A tibble: 104 x 5
# Groups: Region [2]
# Region advertising advertising2 advertising_adstocked advertising2_adstocked
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 500 118. 43.9 118. 43.9
# 2 500 120. 231. 120. 231.
# 3 500 126. 76.8 185. 98.8
# 4 500 115. 22.4 175. 138.
# 5 500 177. 98.1 269. 147.
# 6 500 142. 124. 229. 193.
# 7 500 138. 90.9 273. 165.
# 8 500 0 0 115. 96.3
# 9 500 0 0 136. 82.3
#10 500 0 0 57.3 48.2
# ... with 94 more rows
Upvotes: 2
Reputation: 1321
Using data.table, you can apply any function to a subset of columns using an lapply within the j-statement. These functions will act on the columns as vectors, however, so you might want to edit your foo()
to be general to any vector. But, basically, all you have to do is:
library(data.table)
setDT(advertising_dataset)
cnames <- colnames(advertising_dataset)
advertising_dataset[, lapply(.SD, foo), .SDcols = cnames[cnames != "Region]]
Upvotes: 0