Reputation: 591
Here is a reproducible data set. The problem is to find 1 or 2 consecutive non-NA values in between a series of NA and assign them as NA. If there are more than 2, its fine nothing needs to be done.
set.seed(55)
data <- rnorm(10)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + 0:9*60
R <- xts(x = data, order.by = dates)
colnames(R) <- "R-factor"
R[c(1, 3, 6, 10)] <- NA
R
Output:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 -1.812376850
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 -1.119221005
2019-03-18 10:34:00 0.001908206
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
Expected result:
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.505343855
2019-03-18 10:37:00 -0.099234393
2019-03-18 10:38:00 0.305353199
2019-03-18 10:39:00 NA
I have written a function with for-loop which works fine for a small dataset but it's extremely slow. Original data consists of 100,000+ data points and this function couldn't execute it after more than 10 minutes
Can anyone kindly help me to avoid the loop to make it faster?
Upvotes: 1
Views: 632
Reputation: 263362
This is probably faster than most of the other solutions offered. The rep
function is essentially the inverse of the rle
function. It takes two vector arguments and expands the count of the values of the first by the lengths in the second and this allows a test based on the length of runs and then replacement with is.na <-
. There are actually two different functions: rle(x) which returns a logical vector of length(x) and then there is is.na(x)<-
which makes an assignment of NA to items in x depending on the logical values in the vector to the right of that function.:
rleR <- rle(c(is.na(R[,1]))) #get the position and lengths of nonNA's and NA's
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) ) #set NAs
#--------------
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
Warning message:
timezone of object (CET) is different than current timezone ().
microbenchmark(
Fill = {Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
ave(R, cumsum(is.na(R)), FUN = Fillin)},
RLrep = {rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )})
#----------------------
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 1668.788 1784.6275 1942.5261 1844.5825 2005.0960 4911.762 100 b
RLrep 102.174 113.9565 144.3477 131.4735 156.6715 368.665 100 a
Upvotes: 1
Reputation: 269654
Create a function Fillin
which returns NA if the length is less than or equal to 3 (or 2 if the first element is not NA so that we can handle the first group even if it does not start with an NA) and returns its argument otherwise. Use cumsum
to group the runs and apply Fillin
to each group.
Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
giving:
> R
R-factor
2019-03-18 10:30:00 NA
2019-03-18 10:31:00 NA
2019-03-18 10:32:00 NA
2019-03-18 10:33:00 NA
2019-03-18 10:34:00 NA
2019-03-18 10:35:00 NA
2019-03-18 10:36:00 -0.50534386
2019-03-18 10:37:00 -0.09923439
2019-03-18 10:38:00 0.30535320
2019-03-18 10:39:00 NA
This solution runs at about the same speed as the one using rle
.
library(microbenchmark)
microbenchmark(
Fill = { Fillin <- function(x) if (length(x) <= 3 - !is.na(x[1])) NA else x
Rc <- coredata(R)
R[] <- ave(Rc, cumsum(is.na(Rc)), FUN = Fillin)
},
RLrep = { rleR <- rle(c(is.na(R[,1])))
is.na(R) <- with(rleR, rep(lengths < 3 , lengths ) )
}
)
giving:
Unit: microseconds
expr min lq mean median uq max neval cld
Fill 490.9 509.5 626.550 527.7 596.45 3411.1 100 a
RLrep 523.5 540.8 604.061 550.8 592.00 1244.4 100 a
Upvotes: 4
Reputation: 4150
Maybe try this based on Distance from the closest non NA value in a dataframe
library(tidyverse)
set.seed(55)
x <- 100000
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
the_operation <- . %>%
mutate(lengths_na =lengths_na %>% seq_along %>% rep(lengths_na)) %>%
group_by(lengths_na) %>%
add_tally() %>%
ungroup() %>%
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
microbenchmark::microbenchmark(time_table %>% the_operation)
The results are quite good
Unit: milliseconds
expr min lq mean median uq max neval
time_table %>% the_operation 141.9009 176.2988 203.3744 190.183 214.1691 412.3161 100
Maybe this is simpler to read
library(tidyverse)
set.seed(55)
# Create the data
x <- 100
data <- rnorm(x)
dates <- as.POSIXct("2019-03-18 10:30:00", tz = "CET") + (seq_len(x))*60
time_table1 <- tibble(time = dates,data = data)
# Fake some na's
time_table <- time_table1 %>%
mutate(random = rnorm(x),
new = if_else(random > data,NA_real_,data)) %>%
select(-data,-random) %>%
rename(data= new)
# The rle function counts the occurrences of the same value in a vector,
# We create a T/F vector using is.na function
# meaning that we can count the lenght of sequences with or without na's
lengths_na <- time_table$data %>% is.na %>% rle %>% pluck('lengths')
# This operation here can be done outside of the df
new_col <- lengths_na %>%
seq_along %>% # Counts to the size of this vector
rep(lengths_na) # Reps the lengths of the sequences populating the vector
result <- time_table %>%
mutate(new_col =new_col) %>%
group_by(new_col) %>% # Operates the logic on this group look into the tidyverse
add_tally() %>% # Counts how many instance there are on each group
ungroup() %>% # Not actually needed but good manners
mutate(replace_sequence = if_else(condition = n < 3,true = NA_real_,false = data))
Upvotes: 2
Reputation: 1381
I guess, there are more elegant solutions around, but this cuts the time in half
R_df=as.data.frame(R)
R_df$shift_1=c(R_df$`R-factor`[-1],NA) #shift value one up
R_df$shift_2=c(NA,R_df$`R-factor`[-nrow(R_df)]) #shift value one down
# create new filtered variable
R_df$`R-factor_new`=ifelse(is.na(R_df$`R-factor`),NA,
ifelse((!is.na(R_df$shift_1))|(!is.na(R_df$shift_2)),
R_df$`R-factor`,NA)
> test replications elapsed relative user.self sys.self user.child sys.child
> 2 ifelseapproach 1000 0.83 1.000 0.65 0.19 NA NA
> 1 original 1000 1.81 2.181 1.76 0.01 NA NA
Upvotes: 1