Orion
Orion

Reputation: 1104

Get index of data.table column that matches a value

DT = data.table(
      id = 1:5,
      a  = c(0,1,0,2,5),
      b  = c(1,0,2,4,4),
      c  = c(1,2,0,0,5))

#     id a b c
# 1:  1  0 1 1
# 2:  2  1 0 2
# 3:  3  0 2 0
# 4:  4  2 4 0
# 5:  5  5 4 5

I want to identify the first column from the left which has 0, and put the column index in idx.

#     id a b c idx
# 1:  1  0 1 1 2 
# 2:  2  1 0 2 3
# 3:  3  0 2 0 2
# 4:  4  2 4 0 4
# 5:  5  5 4 5 NA

(Non-data.table solutions, e.g., with dplyr are also welcome)

Upvotes: 4

Views: 976

Answers (5)

s_baldur
s_baldur

Reputation: 33498

DT[, idx := which(.SD == 0)[1] + 1L, by = id]
DT
#    id a b c idx
# 1:  1 0 1 1   2
# 2:  2 1 0 2   3
# 3:  3 0 2 0   2
# 4:  4 2 4 0   4
# 5:  5 5 4 5  NA

Upvotes: 3

chinsoon12
chinsoon12

Reputation: 25225

Too long in a comment, hence comm wiki. Similar to Sotos and tmfmnk, if value will not be found in id, then you can avoid the rowSums:

DT[, idx := {
    x <- max.col(.SD==0, "first")
    replace(x, x==1L, NA_integer_)
}]

timing code:

library(data.table)
set.seed(0L)
nr <- 1e7
DT <- data.table(id=1:nr, a=sample(0:5, nr, TRUE), b=sample(0:5, nr, TRUE), c=sample(0:5, nr, TRUE))
DT0 <- copy(DT)
DT1 <- copy(DT)
DT2 <- copy(DT)
DT3 <- copy(DT)
DT4 <- copy(DT)

mtd0 <- function() {
    replace(max.col(-DT0[, -1] == 0, ties.method = 'first') + 1, rowSums(DT == 0) == 0, NA)

    #or break it into two lines If you want,
    i1 <- max.col(-DT0[,-1] == 0, ties.method = 'first') + 1
    replace(i1, rowSums(DT0 == 0) == 0, NA)
}

mtd1 <- function() {
    DT1[, idx := apply(.SD, 1, function(x) first(which(x == 0 )))]
}

mtd2 <- function() {
    DT2[, idx := which(.SD == 0)[1], by = id]
}

mtd3 <- function() {
    DT2[, idx := fifelse(rowSums(.SD == 0) == 0,
        NA_integer_,
        max.col(-.SD, ties.method = "first"))]
}

mtd4 <- function() {
    DT4[, idx := {
        x <- max.col(.SD==0, "first")
        replace(x, x==1L, NA_integer_)
    }]
}

bench::mark(mtd0(), mtd1(), mtd2(), 
    mtd3(), mtd4(), check=FALSE)

timings:

# A tibble: 5 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result    memory    time   gc    
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>    <list>    <list> <list>
1 mtd0()     136.98ms 145.67ms    5.41     347.8MB    1.80      3     1    554.1ms <dbl [1,~ <df[,3] ~ <bch:~ <tibb~
2 mtd1()        8.66s    8.66s    0.115     76.3MB    0.346     1     3      8.66s <df[,5] ~ <df[,3] ~ <bch:~ <tibb~
3 mtd2()        57.3s    57.3s    0.0175    22.6MB    0.436     1    25      57.3s <df[,5] ~ <df[,3] ~ <bch:~ <tibb~
4 mtd3()       86.6ms  90.69ms   10.3      171.7MB    1.72      6     1   582.21ms <df[,5] ~ <df[,3] ~ <bch:~ <tibb~
5 mtd4()       48.9ms  50.12ms   18.4       97.6MB    1.84     10     1   544.43ms <df[,5] ~ <df[,3] ~ <bch:~ <tibb~

Upvotes: 0

tmfmnk
tmfmnk

Reputation: 39858

One dplyr possibility could be:

DT %>%
 mutate(idx = if_else(rowSums(. == 0) == 0,
                      NA_integer_,
                      max.col(- ., ties.method = "first")))

  id a b c idx
1  1 0 1 1   2
2  2 1 0 2   3
3  3 0 2 0   2
4  4 2 4 0   4
5  5 5 4 5  NA

And the same in data.table:

DT[, idx := ifelse(rowSums(.SD == 0) == 0,
                   NA_integer_,
                   max.col(- .SD, ties.method = "first"))]

Upvotes: 2

Sotos
Sotos

Reputation: 51582

An idea via base R can be,

replace(max.col(-DT[, -1] == 0, ties.method = 'first') + 1, rowSums(DT == 0) == 0, NA)

#or break it into two lines If you want,
i1 <- max.col(-DT[,-1] == 0, ties.method = 'first') + 1
replace(i1, rowSums(DT == 0) == 0, NA)

#[1]  2  3  2  4 NA

Upvotes: 3

Matt
Matt

Reputation: 2987

A data.table solution could be:

DT[, idx := apply(.SD, 1, function(x) first(which(x == 0 )))]

#   id a b c idx
#1:  1 0 1 1   2
#2:  2 1 0 2   3
#3:  3 0 2 0   2
#4:  4 2 4 0   4
#5:  5 5 4 5  NA

Upvotes: 2

Related Questions