Reputation: 107
I'm looking to count how many values in a row are identical. The idea is to be able to filter out respondents that have straightlined (ie answered all questions the same), for instance by filtering out those who have more than 90% identical answers across the cols.
I've come up with the code below, which works, but is slow on large datasets. The example below has 5 columns and 1000 rows, but my real data has 30 cols and 200.000 rows.
# define function
count_identical_values <- function(df){
columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
df %>%
rowwise() %>%
mutate(identical_count = case_when(
# all NA, then NA
all(is.na(c_across(all_of(columns)))) ~ NA_real_,
# else, count the number of identical values
TRUE ~ max(table(c_across(all_of(columns)))))) %>%
ungroup()
}
# make df
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))
# apply function
df = count_identical_values(df)
If anyone has an idea for how to speed this up, that would be great!
Upvotes: 5
Views: 128
Reputation: 72583
Another implementation of row-wise tabulate using matrixStats
. You can wrap the logic to filter out the straight answerers over a specific threshold in a function:
> fltr_straight <- \(x, threshold=.9) {
+ mx <- as.matrix(x) |> matrixStats::rowTabulates() |> matrixStats::rowMaxs()
+ mx/ncol(x) <= threshold
+ }
> df_fltr <- df[fltr_straight(df[1:5]), ]
Check which guys have been filtered out:
> setdiff(rownames(df), rownames(df_fltr))
[1] "171" "261" "287" "586"
Data:
set.seed(63572972)
df <- data.frame(
matrix(sample(c(NA, 1:5), 1000*5, replace=TRUE),
1000, 5)
)
Upvotes: 1
Reputation: 16971
Below is a vectorized function that uses collapse::fmode
. It will process your "real" dataset in less than half a second and can handle any data type. It wasn't clear if NA
should be included in the count, hence the incl.na
argument.
rowsames <- function(df, incl.na = TRUE) {
u <- unlist(df, 0, 0)
m <- match(u, unique(u), NA_integer_, if (incl.na) NULL else NA)
rowSums(`dim<-`(fmode(m, sequence(rep.int(nrow(df), ncol(df))), na.rm = TRUE,
use.g.names = FALSE) == m, dim(df)), na.rm = TRUE)
}
Applying it on the example data.frame
:
set.seed(1548243204)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1000, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1000, replace = TRUE))
rowsames(df)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 2
rowsames(df, FALSE)[1:10]
#> [1] 2 3 3 2 3 2 2 2 3 1
Note NA
is the most frequent value in the 10th row, so rowsames
returns 2
for that row if incl.na
is set to TRUE
and 1
otherwise.
df[1:10,]
#> statement_1 statement_2 statement_3 statement_4 statement_5
#> 1 5 5 4 NA 2
#> 2 4 3 5 5 5
#> 3 5 5 NA 5 NA
#> 4 4 5 2 1 5
#> 5 3 4 4 4 3
#> 6 3 NA 3 1 1
#> 7 NA 4 NA 2 4
#> 8 NA 1 4 3 3
#> 9 4 2 4 3 4
#> 10 2 NA NA 4 5
my real data has 30 cols and 200.000 rows
df <- as.data.frame(matrix(sample(c(NA, 1:5), 6e6, 1), 2e5, 30, 0,
list(NULL, paste0("statement_", 1:30))))
system.time(rowsames(df))
#> user system elapsed
#> 0.41 0.03 0.43
Extending @RonakShah's benchmarking:
set.seed(3452)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))
fun_dplyr <- function(df) {
columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
df %>%
rowwise() %>%
mutate(identical_count = case_when(
# all NA, then NA
all(is.na(c_across(all_of(columns)))) ~ NA_real_,
# else, count the number of identical values
TRUE ~ max(table(c_across(all_of(columns)))))) %>%
ungroup()
}
fun_apply <- function(df) {
apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
}
fun_dapply <- function(df) {
collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}
Timings:
microbenchmark::microbenchmark(
# fun_dplyr = fun_dplyr(df), # too slow
fun_apply = fun_apply(df),
fun_dapply = fun_dapply(df),
rowsames = rowsames(df)
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> fun_apply 298.4000 327.0602 355.26551 352.78755 379.27705 433.5157 100
#> fun_dapply 235.7283 261.1802 300.46456 287.33650 335.46850 442.8762 100
#> rowsames 34.4906 36.1296 37.61214 37.16805 38.68065 47.0696 100
Upvotes: 3
Reputation: 388797
You may use collapse::dapply
which will be faster.
collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
to handle all NA
values you may use hablar::max_
function which returns NA
when all the values are NA
.
Benchmarks :
set.seed(3452)
df = data.frame(statement_1 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_2 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_3 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_4 = sample(c(NA, 1:5), 1e5, replace = TRUE),
statement_5 = sample(c(NA, 1:5), 1e5, replace = TRUE))
fun_dplyr <- function(df) {
columns = c("statement_1", "statement_2", "statement_3", "statement_4", "statement_5")
df %>%
rowwise() %>%
mutate(identical_count = case_when(
# all NA, then NA
all(is.na(c_across(all_of(columns)))) ~ NA_real_,
# else, count the number of identical values
TRUE ~ max(table(c_across(all_of(columns)))))) %>%
ungroup()
}
fun_apply <- function(df) {
apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
}
fun_dapply <- function(df) {
collapse::dapply(df, \(x) max(tabulate(x)), MARGIN = 1)
}
microbenchmark::microbenchmark(
fun_dplyr = fun_dplyr(df),
fun_apply = fun_apply(df),
fun_dapply = fun_dapply(df),
times = 10L
)
which returns
Unit: milliseconds
expr min lq mean median uq max neval
fun_dplyr 125572.7109 126954.4355 131877.4115 133178.7886 135347.5074 136411.1468 10
fun_apply 413.7097 439.3520 476.6822 456.8837 507.4259 583.4736 10
fun_dapply 293.8734 315.4473 380.0586 336.0351 363.5058 629.0772 10
Upvotes: 2
Reputation: 226007
Is
apply(as.matrix(df), MARGIN=1, FUN=\(x) max(tabulate(x)))
faster? (Would need some work to handle NA
values, but rowSums(is.na(as.matrix(df))) == ncol(df)
might find those fairly quickly ...)
Upvotes: 3