Saurabh

Reputation: 1626

How to remove NaN and Inf values from data.table where all columns are character types in R

I have a data.table as follows -

data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26", 
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26", 
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN", 
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN", 
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17", 
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN", 
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", 
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L, 
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))

I want to remove all the NaN and Inf values from this data.table.

     date      open high  low close volume
 1: 2021-11-24  NaN  NaN  NaN   NaN      0
 2: 2021-11-24  NaN  NaN  NaN   NaN      0
 3: 2021-11-26 0.43 0.43 0.43  0.43      2
 4: 2021-11-24 0.17 0.17 0.17  0.17     10
 5: 2021-11-26 0.19 0.19 0.19  0.19     75
 6: 2021-11-24 0.15 0.15 0.15  0.15      1
 7: 2021-11-24  NaN  NaN  NaN   NaN      0
 8: 2021-11-26  NaN  NaN  NaN   NaN      0
 9: 2021-11-26  NaN  NaN  NaN   NaN      0
10: 2021-11-26  NaN  NaN  NaN   NaN      0
11: 2021-11-26  NaN  NaN  NaN   NaN      0

All the columns open, high, low and close are of character type because of the NaN values.

Is there a quick way to remove NaNs directly in data.table?

Here is the performance of each solution given below -

pacman::p_load(data.table, dtplyr, dplyr)
microbenchmark::microbenchmark(
  
  user438383 = data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])],
  
  langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),
  
  akrun  = {data <- type.convert(data, as.is = TRUE);
  data[data[, Reduce(`&`, lapply(.SD, function(x)
    !is.nan(x) & is.finite(x))), .SDcols = -1]]},

  paul = {data <- type.convert(data, as.is = TRUE);
  data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]},
  
  Macosso = {data$Row <- row.names(data);
  rm_rw <- data[apply(data, 1, 
                      function(X) any(X== "NaN"|X== "Inf")),] %>% dplyr::pull(Row);
  data[!row.names(data) %in% rm_rw ,] %>% dplyr::select(-Row)} 
)

Unit: microseconds
       expr      min       lq      mean   median       uq       max neval  cld
 user438383  893.843  931.243  976.4554  974.011 1005.673  1093.929   100 a   
   langtang 2694.987 2779.411 2904.5124 2877.927 3003.832  3420.539   100   c 
      akrun 1664.476 1694.780 2253.8962 1731.392 1838.755 26035.268   100  b  
       paul 1663.552 1718.956 1792.2313 1770.511 1843.051  2151.975   100  b  
    Macosso 5899.961 6140.244 6429.9634 6368.072 6604.615  8180.782   100    d

Upvotes: 1

Views: 1338

Answers (5)

user438383

Reputation: 6206

One way would be to find the indices of the rows containing NaN or Inf:

unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])
[1]  1  2  7  8  9 10 11
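
For reference, arr.ind = TRUE makes which() return a row/col index matrix instead of flat positions, which is why column 1 is taken above; a small illustration on a toy character matrix:

m <- matrix(c("NaN", "1", "Inf", "2"), nrow = 2)
which(m == "NaN" | m == "Inf", arr.ind = TRUE)
#      row col
# [1,]   1   1
# [2,]   1   2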

And then negate these row indices to remove those rows:

data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])]
         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1
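
The ! works because data.table interprets a negated integer vector in i as "all rows except these"; a minimal sketch on a toy table:

library(data.table)
dt <- data.table(x = 1:5)
dt[!c(1, 2)]   # drops rows 1 and 2, keeping x = 3, 4, 5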

Some benchmarks

Unit: milliseconds
     expr        min         lq       mean     median         uq       max neval  cld
       me   4.513141   5.545293   7.068744   6.707279   8.356170  31.30188   100 a   
 langtang   3.535727   3.646819   8.718629   6.318445   6.983275  59.76049   100 a   
    akrun  51.169168 195.102026 208.889413 204.564707 216.545022 274.02575   100   c 
     paul  11.235627 145.195062 146.721146 146.670909 148.432261 200.56718   100  b  
  Macosso 370.269687 448.143027 468.074160 457.499264 497.636319 553.70491   100    d

data = structure(list(date = c("2021-11-24", "2021-11-24", "2021-11-26", 
"2021-11-24", "2021-11-26", "2021-11-24", "2021-11-24", "2021-11-26", 
"2021-11-26", "2021-11-26", "2021-11-26"), open = c("NaN", "NaN", 
"0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"
), high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15", "NaN", 
"NaN", "NaN", "NaN", "NaN"), low = c("NaN", "NaN", "0.43", "0.17", 
"0.19", "0.15", "NaN", "NaN", "NaN", "NaN", "NaN"), close = c("NaN", 
"NaN", "0.43", "0.17", "0.19", "0.15", "NaN", "NaN", "NaN", "NaN", 
"NaN"), volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L, 
0L)), row.names = c(NA, -11L), class = c("data.table", "data.frame"
))
data = do.call("rbind", replicate(1000, data, simplify = FALSE))

library(data.table)
library(dtplyr)
library(dplyr)

res = microbenchmark::microbenchmark(
    me = data[!unique(which(data == "NaN" | data == "Inf", arr.ind=T)[,1])],

    langtang = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5])),

    akrun  = {data <- type.convert(data, as.is = TRUE);
    data[data[, Reduce(`&`, lapply(.SD, function(x)
         !is.nan(x) & is.finite(x))), .SDcols = -1]]},

    paul = data %>% 
        lazy_dt %>%  
        filter(across(2:5, ~ .x != "NaN")) %>% 
        as.data.table,

    Macosso = {data$Row <- row.names(data);
            rm_rw <- data[apply(data, 1, 
                    function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row);
            data[!row.names(data) %in% rm_rw ,] %>% select(-Row)
        } 

    )

Upvotes: 3

PaulS

Reputation: 25323

A solution based on dtplyr:

library(dtplyr)
library(dplyr)
library(data.table)

data <- structure(
  list(date = c("2021-11-24", "2021-11-24", "2021-11-26",
                "2021-11-24", "2021-11-26", "2021-11-24",
                "2021-11-24", "2021-11-26", "2021-11-26",
                "2021-11-26", "2021-11-26"),
       open = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       high = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                "NaN", "NaN", "NaN", "NaN", "NaN"),
       low = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
               "NaN", "NaN", "NaN", "NaN", "NaN"),
       close = c("NaN", "NaN", "0.43", "0.17", "0.19", "0.15",
                 "NaN", "NaN", "NaN", "NaN", "NaN"),
       volume = c(0L, 0L, 2L, 10L, 75L, 1L, 0L, 0L, 0L, 0L, 0L)),
  row.names = c(NA, -11L),
  class = c("data.table", "data.frame"))

data %>% 
  lazy_dt %>%  
  filter(across(2:5, ~ !.x %in% c("NaN","Inf"))) %>%
  as.data.table

#>          date open high  low close volume
#> 1: 2021-11-26 0.43 0.43 0.43  0.43      2
#> 2: 2021-11-24 0.17 0.17 0.17  0.17     10
#> 3: 2021-11-26 0.19 0.19 0.19  0.19     75
#> 4: 2021-11-24 0.15 0.15 0.15  0.15      1
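
As a side note, recent dplyr versions deprecate across() inside filter() in favour of if_all(); an equivalent sketch, assuming a dtplyr version that translates if_all():

data %>% 
  lazy_dt %>%  
  filter(if_all(2:5, ~ !.x %in% c("NaN", "Inf"))) %>%
  as.data.table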

In case the NaN and Inf values are not of character type, as @akrun mentions in his answer, the following is also a solution:

library(data.table)

# Not needed if NaN and Inf are not strings
data <- type.convert(data, as.is = TRUE)

data[data[,is.finite(rowSums(.SD)), .SDcols=-1]]

#>          date open high  low close volume
#> 1: 2021-11-26 0.43 0.43 0.43  0.43      2
#> 2: 2021-11-24 0.17 0.17 0.17  0.17     10
#> 3: 2021-11-26 0.19 0.19 0.19  0.19     75
#> 4: 2021-11-24 0.15 0.15 0.15  0.15      1
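
This works because rowSums() propagates NaN and Inf, so is.finite() on the row sums is FALSE exactly for the rows to drop; a small illustration:

m <- cbind(a = c(1, NaN, Inf), b = c(2, 3, 4))
rowSums(m)
#> [1]   3 NaN Inf
is.finite(rowSums(m))
#> [1]  TRUE FALSE FALSE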

Upvotes: 1

akrun

Reputation: 886938

The NaN values were quoted, so the columns were unnecessarily changed to character type.

> str(data)
Classes ‘data.table’ and 'data.frame':  11 obs. of  6 variables:
 $ date  : chr  "2021-11-24" "2021-11-24" "2021-11-26" "2021-11-24" ...
 $ open  : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ high  : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ low   : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ close : chr  "NaN" "NaN" "0.43" "0.17" ...
 $ volume: int  0 0 2 10 75 1 0 0 0 0 ...

We may need to convert the column types automatically and then use data.table methods: loop over the columns other than 'date' by specifying .SDcols, create the logical condition that the column values are not NaN (!is.nan) and (&) are finite (is.finite), Reduce the per-column logical vectors to a single vector with &, and use that to subset the rows.

library(data.table)
data <- type.convert(data, as.is = TRUE)
out <- data[data[, Reduce(`&`, lapply(.SD, function(x)
     !is.nan(x) & is.finite(x))), .SDcols = -1]]
out
         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1
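
To see what the Reduce(`&`, ...) step does with the per-column logical vectors, a small standalone example:

Reduce(`&`, list(c(TRUE, TRUE, FALSE), c(TRUE, FALSE, TRUE)))
# [1]  TRUE FALSE FALSE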

Upvotes: 2

Macosso

Reputation: 1439

One strategy is to get the indices of the rows containing NaN (or Inf) and then filter those rows out.

library(dplyr)

data$Row <- row.names(data)
rm_rw <- data[apply(data, 1, 
                    function(X) any(X== "NaN"|X== "Inf")),] %>% pull(Row)
data[!row.names(data) %in% rm_rw ,] %>% select(-Row)

         date open high  low close volume
1: 2021-11-26 0.43 0.43 0.43  0.43      2
2: 2021-11-24 0.17 0.17 0.17  0.17     10
3: 2021-11-26 0.19 0.19 0.19  0.19     75
4: 2021-11-24 0.15 0.15 0.15  0.15      1

Update 1
Changed any(X == "NaN") to any(X == "NaN" | X == "Inf") so that Inf values are also filtered out.

Upvotes: 2

langtang

Reputation: 24722

Could you just use as.numeric to convert?

result = na.omit(cbind(data[, .(date,volume)], data[, lapply(.SD, as.numeric), .SDcols = 2:5]))

output:

         date volume open high  low close
1: 2021-11-26      2 0.43 0.43 0.43  0.43
2: 2021-11-24     10 0.17 0.17 0.17  0.17
3: 2021-11-26     75 0.19 0.19 0.19  0.19
4: 2021-11-24      1 0.15 0.15 0.15  0.15
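
Note that cbind() places volume before the price columns; if the original column order matters, it can be restored by reference with data.table's setcolorder (a minimal sketch applied to the result above):

setcolorder(result, c("date", "open", "high", "low", "close", "volume"))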

Upvotes: 2
