Reputation: 1800
I have some trouble with which.min
function inside a dplyr pipe
I have a cumbersome solution (*)
and I'm looking for a more compact and elegant way to do this
library(dplyr)
# Toy data: three numeric columns with scattered NAs; row 2 is entirely NA.
data <- data.frame(
  s1 = c(10, NA, 5, NA, NA),
  s2 = c(8, NA, NA, 4, 20),
  s3 = c(NA, NA, 2, NA, 10)
)
data
#> s1 s2 s3
#> 1 10 8 NA
#> 2 NA NA NA
#> 3 5 NA 2
#> 4 NA 4 NA
#> 5 NA 20 10
here with min(x,na.rm=TRUE)
I could extract the min value
# Row-by-row minimum over s1, s2 and s3, ignoring NAs. For a row that is
# entirely NA, min() has nothing left to compare: it warns and returns Inf.
data %>%
  rowwise() %>%
  mutate(Min_s = min(c(s1, s2, s3), na.rm = TRUE))
#> Warning: There was 1 warning in `mutate()`.
#> ℹ In argument: `Min_s = min(c(s1, s2, s3), na.rm = TRUE)`.
#> ℹ In row 2.
#> Caused by warning in `min()`:
#> ! no non-missing arguments to min; returning Inf
#> # A tibble: 5 × 4
#> # Rowwise:
#> s1 s2 s3 Min_s
#> <dbl> <dbl> <dbl> <dbl>
#> 1 10 8 NA 8
#> 2 NA NA NA Inf
#> 3 5 NA 2 2
#> 4 NA 4 NA 4
#> 5 NA 20 10 10
Here I'm having trouble extracting which variable contains the min value
# Position of the row minimum among (s1, s2, s3). This fails on the all-NA
# row: which.min() yields a zero-length result there, but mutate() requires
# exactly one value per row.
data %>%
  rowwise() %>%
  mutate(which_s = which.min(c(s1, s2, s3)))
#> Error in `mutate()`:
#> ℹ In argument: `which_s = which.min(c(s1, s2, s3))`.
#> ℹ In row 2.
#> Caused by error:
#> ! `which_s` must be size 1, not 0.
#> ℹ Did you mean: `which_s = list(which.min(c(s1, s2, s3)))` ?
# Solution (*)
# Workaround: call which.min() only when at least one of the three values is
# present in the row; otherwise return NA so every row yields one value.
data %>%
  rowwise() %>%
  mutate(
    which_s = if (!is.na(s1) || !is.na(s2) || !is.na(s3)) {
      which.min(c(s1, s2, s3))
    } else {
      NA
    }
  )
#> # A tibble: 5 × 4
#> # Rowwise:
#> s1 s2 s3 which_s
#> <dbl> <dbl> <dbl> <int>
#> 1 10 8 NA 2
#> 2 NA NA NA NA
#> 3 5 NA 2 3
#> 4 NA 4 NA 2
#> 5 NA 20 10 3
Created on 2024-11-07 with reprex v2.1.0
Upvotes: 4
Views: 96
Reputation: 7979
I sometimes miss a good row.which.min
function. This is far from good and not harmonised to work (well) with {dplyr}
-language, but might help here.
v0
#' Row-wise `which.min()` for a data frame
#'
#' For every row, finds the column holding the smallest non-missing value
#' using a single vectorised `max.col()` call on the negated values.
#'
#' @param .data A data frame with numeric (or logical) value columns.
#' @param .cols Columns to scan, given as anything `.data[.cols]` accepts
#'   (names, positions, logical mask). Defaults to all columns.
#' @param .names If `TRUE`, return column names; if `FALSE` (default), return
#'   integer positions counted within the selected columns.
#' @param tm Tie-breaking rule passed to `max.col()` as `ties.method`:
#'   `"first"` (default), `"last"` or `"random"`.
#' @return A vector with one element per row of `.data`: integer positions,
#'   or column names when `.names = TRUE`. Rows whose selected values are all
#'   missing give `NA`.
row.which.min <- function(.data, .cols, .names = FALSE, tm = "first") {
  if (missing(.cols)) .cols <- names(.data)
  x <- .data[.cols]
  m <- as.matrix(x)

  # Rows with at least one non-missing value; the others stay NA in the result.
  has_value <- rowSums(!is.na(m)) > 0
  out <- rep(NA_integer_, nrow(.data))

  if (any(has_value)) {
    vals <- m[has_value, , drop = FALSE]

    # The row minimum is the row maximum of the negated values; missing
    # entries are pushed to -Inf so they can never win against a real value.
    score <- -vals
    score[is.na(score)] <- -Inf
    pick <- max.col(score, ties.method = tm)

    # If every non-missing value in a row is +Inf, its score ties with the
    # -Inf used for missing entries and an NA position may have been chosen.
    # Re-pick among the non-missing positions only, honouring the tie rule.
    landed_on_na <- is.na(vals[cbind(seq_along(pick), pick)])
    if (any(landed_on_na)) {
      present <- !is.na(vals[landed_on_na, , drop = FALSE])
      pick[landed_on_na] <- max.col(present + 0, ties.method = tm)
    }

    out[has_value] <- pick
  }

  # Positions refer to the selected columns, so names must come from `x`,
  # not from the full `.data`.
  if (.names) names(x)[out] else out
}
giving
> df0 = data.frame(s1=c(10,NA,5,NA,NA),s2=c(8,NA,NA,4,20),s3=c(NA,NA,2,NA,10))
> row.which.min(df0, .names = TRUE)
[1] "s2" NA "s3" "s2" "s3"
Upvotes: 2
Reputation: 17656
Without using rowwise()
, you could do this in either base R or a single mutate()
step using purrr::pmap_chr()
:
Base R:
# Name of the column holding each row's minimum; NA when the whole row is NA.
# Only s1:s3 are scanned, so any other column in `data` (including min_base
# itself when this line is re-run) cannot take part in the comparison.
data$min_base <- apply(
  data[c("s1", "s2", "s3")], 1,
  \(x) if (all(is.na(x))) NA_character_ else names(x)[which.min(x)]
)
dplyr
/purrr
library(dplyr)
# Same result in one mutate() step: pmap_chr() hands each row of s1:s3 to the
# function as named arguments, so the winning name is read from the values
# themselves rather than looked up by position in the full data frame.
data <- data %>%
  mutate(min_dplyr = purrr::pmap_chr(select(., s1:s3), \(...) {
    vals <- c(...)
    # NA_character_ keeps the return type character for pmap_chr().
    if (all(is.na(vals))) NA_character_ else names(vals)[which.min(vals)]
  }))
Output:
# s1 s2 s3 min_base min_dplyr
# 1 10 8 NA s2 s2
# 2 NA NA NA <NA> <NA>
# 3 5 NA 2 s3 s3
# 4 NA 4 NA s2 s2
# 5 NA 20 10 s3 s3
Note that among these answers, the base R custom function by @friede is substantially faster, followed by this base R approach:
# Benchmark the four approaches on the example data repeated 1e5 times.
bigdata <- data[rep(seq_len(nrow(data)), 1e5), ]
microbenchmark::microbenchmark(
  rowwise = bigdata %>%
    rowwise() %>%
    mutate(which_s = list(which.min(c(s1, s2, s3)))) %>%
    tidyr::unnest(which_s, keep_empty = TRUE),
  base = unlist(apply(bigdata, 1, \(x) ifelse(all(is.na(x)), NA, names(bigdata)[which.min(x)]))),
  pmap = bigdata %>%
    mutate(min_dplyr = purrr::pmap_chr(select(., s1:s3), \(...) {
      ifelse(all(is.na(c(...))), NA, colnames(bigdata)[which.min(c(...))])
    })),
  # row.which.min() names these arguments `.names` and `tm`; calling it with
  # `names =` / `ties =` stops with an "unused arguments" error.
  custom_row.which.min = row.which.min(bigdata, .names = TRUE, tm = "first")
)
# expr min lq mean median uq max neval cld
# rowwise 3730.8131 4512.870 6018.3180 4985.6024 5913.5166 53501.838 100 a
# base 2419.1913 3162.745 4309.7700 3557.7805 4427.4588 32814.209 100 b
# pmap 3837.8870 4593.846 6091.5265 5203.0391 5984.0412 22015.418 100 a
# custom_row.which.min 108.4075 147.695 221.7602 168.5267 240.6043 1419.106 100 c
Upvotes: 2
Reputation: 102529
In your second row, you will obtain integer(0)
in the column which_s
, and that is why the pipeline cannot run without an error.
Instead, you could first store the results in a list, and then unnest
(don't forget to enable keep_empty
argument in unnest
)
# which.min() returns integer(0) for the all-NA row, so each result is wrapped
# in a list and flattened afterwards; keep_empty = TRUE turns the empty entry
# into NA instead of dropping that row. unnest() lives in tidyr, which is not
# attached by library(dplyr), hence the explicit namespace.
data %>%
  rowwise() %>%
  mutate(which_s = list(which.min(c(s1, s2, s3)))) %>%
  tidyr::unnest(which_s, keep_empty = TRUE)
which gives
# A tibble: 5 × 4
s1 s2 s3 which_s
<dbl> <dbl> <dbl> <int>
1 10 8 NA 2
2 NA NA NA NA
3 5 NA 2 3
4 NA 4 NA 2
5 NA 20 10 3
Upvotes: 7