Reputation: 1626
I have a data.table as follows -
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
I want to remove all the NaN
and Inf
values from this data.table.
date open high low close volume
1: 2021-11-24 NaN NaN NaN NaN 0
2: 2021-11-24 NaN NaN NaN NaN 0
3: 2021-11-26 0.43 0.43 0.43 0.43 2
4: 2021-11-24 0.17 0.17 0.17 0.17 10
5: 2021-11-26 0.19 0.19 0.19 0.19 75
6: 2021-11-24 0.15 0.15 0.15 0.15 1
7: 2021-11-24 NaN NaN NaN NaN 0
8: 2021-11-26 NaN NaN NaN NaN 0
9: 2021-11-26 NaN NaN NaN NaN 0
10: 2021-11-26 NaN NaN NaN NaN 0
11: 2021-11-26 NaN NaN NaN NaN 0
All the columns open
, high
, low
, close
are character types due to NaN
values.
Is there a quick way to remove NaNs directly in data.table
?
Here is the performance of each solution given below -
p_load(dtplyr, dplyr)
microbenchmark::microbenchmark(
user438383 = data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])],
langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),
akrun = {data <- type.convert(data, as.is = TRUE);
data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]]},
paul = {data <- type.convert(data, as.is = TRUE);
data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]},
Macosso = {data$Row <- row.names(data);
rm_rw <- data[apply(data, 1,
function(X) any(X== "NaN"|X== "Inf")),] %>% dplyr::pull(Row);
data[!row.names(data) %in% rm_rw ,] %>% dplyr::select(-Row)}
)
Unit: microseconds
expr min lq mean median uq max neval cld
user438383 893.843 931.243 976.4554 974.011 1005.673 1093.929 100 a
langtang 2694.987 2779.411 2904.5124 2877.927 3003.832 3420.539 100 c
akrun 1664.476 1694.780 2253.8962 1731.392 1838.755 26035.268 100 b
paul 1663.552 1718.956 1792.2313 1770.511 1843.051 2151.975 100 b
Macosso 5899.961 6140.244 6429.9634 6368.072 6604.615 8180.782 100 d
Upvotes: 1
Views: 1338
Reputation: 6206
One way would be to find the index of the rows containing NaN
:
unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])
[1] 1 2 7 8 9 10 11
And then set a logical condition to remove these rows:
data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])]
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
Some benchmarks
Unit: milliseconds
expr min lq mean median uq max neval cld
me 4.513141 5.545293 7.068744 6.707279 8.356170 31.30188 100 a
langtang 3.535727 3.646819 8.718629 6.318445 6.983275 59.76049 100 a
akrun 51.169168 195.102026 208.889413 204.564707 216.545022 274.02575 100 c
paul 11.235627 145.195062 146.721146 146.670909 148.432261 200.56718 100 b
Macosso 370.269687 448.143027 468.074160 457.499264 497.636319 553.70491 100 d
data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26",
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN",
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN",
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17",
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN",
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN",
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L,
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
data = do.call("rbind", replicate(1000, data, simplify = FALSE))
library(dtplyr)
res = microbenchmark::microbenchmark(
me = data[!unique(which(data == NaN, arr.ind=T)[,1])],
langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),
akrun = {data <- type.convert(data, as.is = TRUE);
data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]]},
paul = data %>%
lazy_dt %>%
filter(across(2:5, ~ .x != "NaN")) %>%
as.data.table,
Macosso = {data$Row <- row.names(data);
rm_rw <- data[apply(data, 1,
function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row);
data[!row.names(data) %in% rm_rw ,] %>% select(-Row)
}
)
Upvotes: 3
Reputation: 25323
A solution based on dtplyr
:
library(dtplyr)
library(dplyr)
library(data.table)
data <- structure(
list(date=c("2021-11-24","2021-11-24","2021-11-26",
"2021-11-24","2021-11-26","2021-11-24",
"2021-11-24","2021-11-26","2021-11-26",
"2021-11-26","2021-11-26"),
open=c("NaN","NaN","0.43","0.17","0.19","0.15",
"NaN","NaN","NaN","NaN","NaN"),
high=c("NaN","NaN","0.43","0.17","0.19","0.15","NaN",
"NaN","NaN","NaN","NaN"),low=c("NaN","NaN","0.43","0.17","0.19","0.15","NaN","NaN","NaN","NaN","NaN"),close=c("NaN","NaN","0.43","0.17","0.19","0.15","NaN","NaN","NaN","NaN","NaN"),volume=c(0L,0L,2L,10L,75L,1L,0L,0L,0L,0L,0L)),row.names=c(NA,-11L),class=c("data.table","data.frame"))
data %>%
lazy_dt %>%
filter(across(2:5, ~ !.x %in% c("NaN","Inf"))) %>%
as.data.table
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
In case the NaN
and Inf
are not character type, as @akrun mentions in his answer, then the following is also a solution:
library(data.table)
# No need of this, if NaN e Inf are not strings
data <- type.convert(data, as.is = TRUE)
data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]
#> date open high low close volume
#> 1: 2021-11-26 0.43 0.43 0.43 0.43 2
#> 2: 2021-11-24 0.17 0.17 0.17 0.17 10
#> 3: 2021-11-26 0.19 0.19 0.19 0.19 75
#> 4: 2021-11-24 0.15 0.15 0.15 0.15 1
Upvotes: 1
Reputation: 886938
The NaN
created was quoted, so the columns were unnecessarily type changed to character
.
> str(data)
Classes ‘data.table’ and 'data.frame': 11 obs. of 6 variables:
$ date : chr "2021-11-24" "2021-11-24" "2021-11-26" "2021-11-24" ...
$ open : chr "NaN" "NaN" "0.43" "0.17" ...
$ high : chr "NaN" "NaN" "0.43" "0.17" ...
$ low : chr "NaN" "NaN" "0.43" "0.17" ...
$ close : chr "NaN" "NaN" "0.43" "0.17" ...
$ volume: int 0 0 2 10 75 1 0 0 0 0 ...
We may need to convert the type automatically and then use data.table
methods - loop over the columns other than the 'date' by specifying the .SDcols
, create the logical expression i.e. the column values are not NaN (!is.nan
) and (&
) is finite (is.finite
), Reduce
the logical vectors to a single vector with &
and subset the rows
library(data.table)
data <- type.convert(data, as.is = TRUE)
out <- data[data[, Reduce(`&`, lapply(.SD, function(x)
!is.nan(x) & is.finite(x))), .SDcols = -1]]
out
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
Upvotes: 2
Reputation: 1439
The optimal strategy would be, to get the index of rows with NaN
then filter out those indexes.
library(dplyr)
data$Row <- row.names(data)
rm_rw <- data[apply(data, 1,
function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row)
data[!row.names(data) %in% rm_rw ,] %>% select(-Row)
date open high low close volume
1: 2021-11-26 0.43 0.43 0.43 0.43 2
2: 2021-11-24 0.17 0.17 0.17 0.17 10
3: 2021-11-26 0.19 0.19 0.19 0.19 75
4: 2021-11-24 0.15 0.15 0.15 0.15 1
Update 1
Changed any(X== "NaN"))
to any(X== "NaN"|X== "Inf"))
so that Inf
can also be filtered out
Upvotes: 2
Reputation: 24722
Could you just us as.numeric
to convert?
result = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5]))
output:
date volume open high low close
1: 2021-11-26 2 0.43 0.43 0.43 0.43
2: 2021-11-24 10 0.17 0.17 0.17 0.17
3: 2021-11-26 75 0.19 0.19 0.19 0.19
4: 2021-11-24 1 0.15 0.15 0.15 0.15
Upvotes: 2