Matias Andina
Matias Andina

Reputation: 4230

Odd behavior of dplyr::between and filter

I have a data.frame I want to filter based on whether the range from low to high contains zero. Here's an example:

head(toy)
# A tibble: 6 x 3
  difference     low  high
       <dbl>   <dbl> <dbl>
1     0.0161 -0.143  0.119
2     0.330   0.0678 0.656
3     0.205  -0.103  0.596
4     0.521   0.230  0.977
5     0.328   0.177  0.391
6    -0.0808 -0.367  0.200

I could swear I have used dplyr::between() to do this kind of filtering operation a million times (even with columns of class datetime, where it warns about S3 objects). But I can't find what's wrong with this one.

# Does not find anything
toy %>%
  filter(!dplyr::between(0, low, high))

# Maybe it's because it needs `x` to be a vector, using mutate
# Does not find anything
toy %>%
  mutate(zero = 0) %>% 
  filter(!dplyr::between(zero, low, high))

# if we check the logic, all "keep" go to FALSE
toy %>% 
  mutate(zero = 0, 
         keep = !dplyr::between(zero, low, high)) 

# data.table::between works
toy %>% 
  filter(!data.table::between(0, low, high))

# regular logic works
toy %>% 
  filter(low > 0 | high < 0)

The data below:

> dput(toy)
structure(list(difference = c(0.0161058505175378, 0.329976207353122, 
                              0.20517072042705, 0.520837282826481, 0.328289597476641, -0.0807728725339096, 
                              0.660320444135006, 0.310679750033675, -0.743294517440579, -0.00665462977775899, 
                              0.0890903981794149, 0.0643321993757249, 0.157453334405998, 0.107320325893175, 
                              -0.253664041938671, -0.104025850079389, -0.284835573264143, -0.330557762091307, 
                              -0.0300387610595219, 0.081297046765014), low = c(-0.143002432870633, 
                                                                               0.0677907794288728, -0.103344717845837, 0.229753302951895, 0.176601773133456, 
                                                                               -0.366899428200429, 0.403702557199546, 0.0216878391530755, -1.01129163487875, 
                                                                               -0.222395625167488, -0.135193611295608, -0.116654715121314, -0.168581379777843, 
                                                                               -0.281919444558125, -0.605918194917671, -0.364539852350809, -0.500147478407119, 
                                                                               -0.505906196974183, -0.233810558283787, -0.193048952382206), 
               high = c(0.118860787421672, 0.655558974886329, 0.595905673925067, 
                        0.97748896372657, 0.391043536410999, 0.199727242557477, 0.914173497837859, 
                        0.633804982827898, -0.549942089679123, 0.19745782761473, 
                        0.340823604797603, 0.317956343103116, 0.501279107093568, 
                        0.442497779066522, 0.0721480109893818, 0.280593530192991, 
                        -0.0434862536882377, -0.229723776097642, 0.22550243301984, 
                        0.252686968655449)), row.names = c(NA, -20L), class = c("tbl_df", 
                                                                                "tbl", "data.frame"))

Just in case somebody finds it useful

> "between" %in% conflicts()
[1] FALSE
> packageVersion("dplyr")
[1] ‘1.0.2’

Upvotes: 1

Views: 38

Answers (2)

akrun
akrun

Reputation: 887511

We could use map2

library(dplyr)
library(purrr)
toy %>%
   filter(!map2_lgl(low, high, ~ between(0, .x, .y)))

-output

# A tibble: 8 x 3
  difference     low    high
       <dbl>   <dbl>   <dbl>
1      0.330  0.0678  0.656 
2      0.521  0.230   0.977 
3      0.328  0.177   0.391 
4      0.660  0.404   0.914 
5      0.311  0.0217  0.634 
6     -0.743 -1.01   -0.550 
7     -0.285 -0.500  -0.0435
8     -0.331 -0.506  -0.230 

Upvotes: 1

tmfmnk
tmfmnk

Reputation: 40131

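dplyr::between() is vectorized over x, but (in this version of dplyr) not over its left and right bounds, so passing whole columns as bounds does not compare row by row. One thing you could do is: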

df %>%
 rowwise() %>%
 filter(!dplyr::between(0, low, high))

  difference     low    high
       <dbl>   <dbl>   <dbl>
1      0.330  0.0678  0.656 
2      0.521  0.230   0.977 
3      0.328  0.177   0.391 
4      0.660  0.404   0.914 
5      0.311  0.0217  0.634 
6     -0.743 -1.01   -0.550 
7     -0.285 -0.500  -0.0435
8     -0.331 -0.506  -0.230 

data.table::between() is vectorized over its lower and upper bounds as well: that's the reason why it works.

Upvotes: 2

Related Questions