Reputation: 57
thanks for reviewing my questions
I am trying to create a new variable, that will get value from another variable if certain conditions are met; otherwise, take the value of the previous observation.
I can do it by running a loop like this:
data <- mtcars
data$test <- NA
data$test <- as.numeric(data$test)
a0 <- Sys.time()
for (i in 2:nrow(data)) {
ifelse(data$carb[[i]] < 4,
data$test[[i]] <- data[[i-1,'test']],
data$test[[i]] <- data[[i,'mpg']]
)
a1 <- Sys.time()
per_left <- (i)/nrow(data)
print(paste("Time left is", round(as.numeric(as.difftime(a1-a0, units = "mins"))/per_left,2),"mins"))
}
However, my data set more than 8million observations. I feel like this is not the optimal way to save time.
***For function Lag: Somehow it seems like if I use lag, the lag function will use the data of the previously recorded, not updated one.
For example.
df1 <- data.frame(ID = c(1, 1, 1, 1, 4, 5),
condition = c(FALSE,TRUE,TRUE,TRUE, TRUE, FALSE),
var1 = c('a', 'b', 'c', 'd', 'f', 'e'))
df2 <- df1 %>%
mutate(
new_2 = '0',
new_2 = case_when(
ID == lag(ID) & condition == TRUE ~ lag(new_2),
TRUE ~ var1
))
> df2
ID condition var1 new_2
1 1 FALSE a a
2 1 TRUE b 0
3 1 TRUE c 0
4 1 TRUE d 0
5 4 TRUE f f
6 5 FALSE e e
It should be
ID condition var1 new_2
1 1 FALSE a a
2 1 TRUE b a
3 1 TRUE c a
4 1 TRUE d a
5 4 TRUE f f
6 5 FALSE e e
I run the above function, row 2 should take the previous value - a, not the default value of 0. While I if I go with the for loop, it will take the "a".
Is there are function that does it? Or how should I update my function to make it faster?
Please advise. Thank you!
Upvotes: 1
Views: 679
Reputation: 887028
We can use lag
and ifelse
is already vectorized. So, either ifelse
or case_when
can be used. But, case_when
would be more generalizable when there are multiple conditions
library(dplyr)
out <- data %>%
mutate(new = case_when(carb < 4 ~ lag(test), TRUE ~ mpg))
To make this faster, another option is shift
from data.table
library(data.table)
setDT(data)[, new := fifelse(carb < 4, shift(test), mpg)]
For the second dataset, perhaps
library(dplyr)
df1 %>%
mutate(new_2 = replace(var1, condition, lag(var1[condition])))
-output
# ID condition var1 new_2
#1 1 FALSE a a
#2 1 TRUE b <NA>
#3 1 TRUE c b
#4 1 TRUE d c
#5 4 TRUE f d
#6 5 FALSE e e
Or it could be
df1 %>%
group_by(ID) %>%
mutate(new_2 = case_when(condition ~ lag(var1), TRUE ~ '0'))
# A tibble: 6 x 4
# Groups: ID [3]
# ID condition var1 new_2
# <dbl> <lgl> <chr> <chr>
#1 1 FALSE a 0
#2 1 TRUE b a
#3 1 TRUE c b
#4 1 TRUE d c
#5 4 TRUE f <NA>
or using data.table
setDT(df1)[condition, new_2 := shift(var1)]
Based on the updated expected output
df1 %>%
group_by(ID) %>%
mutate(new_2 = lag(var1)) %>%
group_by(grp = rleid(condition), .add = TRUE) %>%
mutate(new_2 = coalesce(first(new_2), var1)) %>%
ungroup %>%
dplyr::select(-grp)
# A tibble: 6 x 4
# ID condition var1 new_2
# <dbl> <lgl> <chr> <chr>
#1 1 FALSE a a
#2 1 TRUE b a
#3 1 TRUE c a
#4 1 TRUE d a
#5 4 TRUE f f
#6 5 FALSE e e
Upvotes: 3
Reputation: 4841
One option in base R is to replace the ifelse
with if() ... else
. However, a much faster solution using base R is to use a combination of ifelse
and Reduce
. This is 3.46 / .044 ~ 78 times faster than the OP's solution. The solution is:
v2 <- mtcars
v2$test <- ifelse(v2$carb < 4, NA_real_, v2$mpg)
v2$test <- Reduce(
function(xprev, xnew)
if(is.na(xnew)) xprev else xnew,
v2$test, accumulate = TRUE, init = v2$mpg[1])[-1]
Here is a comparison with some alternatives:
# works even if the first entry does not comply with the condition
mtcars$carb[1] <- 1
# essentially the OPs solution
v0 <- mtcars
v0$test <- v0$mpg
for (i in 2:nrow(v0))
ifelse(v0$carb[[i]] < 4,
v0$test[[i]] <- v0[[i-1,'test']],
v0$test[[i]] <- v0[[i,'mpg']])
# using if ... else instead of ifelse
v1 <- mtcars
v1$test <- v1$mpg
for (i in 2:nrow(v1))
v1$test[i] <- if(v1$carb[i] < 4) v1$test[i - 1] else v1$test[i]
# we get the same
all.equal(v0, v1)
#R> [1] TRUE
# using ifelse + Reduce
v2 <- mtcars
v2$test <- ifelse(v2$carb < 4, NA_real_, v2$mpg)
v2$test <- Reduce(
function(xprev, xnew)
if(is.na(xnew)) xprev else xnew,
v2$test, accumulate = TRUE, init = v2$mpg[1])[-1]
# we get the same
all.equal(v0, v2)
#R> [1] TRUE
# compare the computation time
bench::mark(
`ifelse` = {
v0 <- mtcars
v0$test <- v0$mpg
for (i in 2:nrow(v0))
ifelse(v0$carb[[i]] < 4,
v0$test[[i]] <- v0[[i-1,'test']],
v0$test[[i]] <- v0[[i,'mpg']])
},
`if ... else` = {
v1 <- mtcars
v1$test <- v1$mpg
for (i in 2:nrow(v1))
v1$test[i] <- if(v1$carb[i] < 4) v1$test[i - 1] else v1$test[i]
},
`ifelse + reduce` = {
v2 <- mtcars
v2$test <- ifelse(v2$carb < 4, NA_real_, v2$mpg)
v2$test <- Reduce(
function(xprev, xnew)
if(is.na(xnew)) xprev else xnew,
v2$test, accumulate = TRUE, init = v2$mpg[1])[-1]
}, min_time = 1, check = FALSE)
#R> # A tibble: 3 x 13
#R> expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
#R> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
#R> 1 ifelse 3.27ms 3.46ms 285. 50.05KB 16.8 254 15 891ms
#R> 2 if ... else 2.72ms 2.9ms 341. 48.7KB 17.9 304 16 892ms
#R> 3 ifelse + reduce 40.98µs 44.92µs 21803. 9.79KB 19.6 9991 9 458ms
However, my data set more than 8million observations. I feel like this is not the optimal way to save time.
The ifelse
and Reduce
solution runs in ~9 seconds on my computer with 8 million rows which I guess is manageable if it is only done once:
# simulate a large data set
set.seed(1)
n <- 8e6
dum_dat <- data.frame(var_1 = runif(n, 0, 8), var_2 = rnorm(n))
system.time({
dum_dat$test <- ifelse(dum_dat$var_1 < 4, NA_real_, dum_dat$var_2)
func <- compiler::cmpfun(
function(xprev, xnew)
if(is.na(xnew)) xprev else xnew)
dum_dat$test <- Reduce(
func, dum_dat$test, accumulate = TRUE, init = dum_dat$var_2[1])[-1]
})
#R> user system elapsed
#R> 8.816 0.064 8.882
Upvotes: 1
Reputation: 388862
You can do this in base R by getting the positions of the condition (data$carb < 4
) and get the index to replace by subtracting -1 to those index values.
data <- mtcars
data$test <- mtcars$mpg
inds <- which(data$carb < 4)
data$test[inds] <- data$test[inds - 1]
Many of the R functions are vectorised so you would not need an explicit for
loop for them.
Upvotes: 2
Reputation: 11584
we can just add lag() function to your ifelse condition:
data$test = ifelse(data$carb < 4, data$test <- lag(data$test), data$test <- data$mpg)
> data
mpg cyl disp hp drat wt qsec vs am gear carb test
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 21.0
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 21.0
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 21.0
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 21.0
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 22.8
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 21.4
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 14.3
Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 14.3
Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 14.3
Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 19.2
Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 17.8
Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 17.8
Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 17.8
Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 16.4
Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 10.4
Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 10.4
Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 14.7
Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 14.7
Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 14.7
Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 32.4
Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 30.4
Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 33.9
AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 21.5
Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 13.3
Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 13.3
Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 13.3
Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 19.2
Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 27.3
Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 15.8
Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 19.7
Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 15.0
Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 15.0
>
Upvotes: 0