Vitaliy  Radchenko
Vitaliy Radchenko

Reputation: 178

Create a new column with non-null columns' names

My data set looks like this one:

library(data.table)

df <- data.table(a = c(1,2,3,4,5),
                 b = c(1,0,2,5,1),
                 c = c(0,1,1,0,0),
                 d = c(1,0,0,2,2))

df
#    a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2

I want to create a new column with non-null columns names. The result will be:

df_result <- data.table(a = c(1,2,3,4,5),
                        z = c('b_d', 'c', 'b_c', 'b_d', 'b_d'))

df_result
#    a   z
# 1: 1 b_d
# 2: 2   c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d

Upvotes: 6

Views: 614

Answers (4)

alexis_laz
alexis_laz

Reputation: 13122

Assuming nrow >> ncol, you could work columnwise

ff = function(x)
{
    ans = character(nrow(x))
    for(j in seq_along(x)) {
        i = x[[j]] > 0L
        ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
    }
    return(gsub("^_", "", ans))
}
ff(df[, -1L, with = FALSE]) #or, `df[, ff(.SD), .SDcols = -1L]` from David Arenburg
#[1] "b_d" "c"   "b_c" "b_d" "b_d"

Upvotes: 12

akrun
akrun

Reputation: 887901

One option would be to convert the format from 'wide' to 'long' using melt. Grouped by 'a', we paste the 'variable' elements that corresponds to non-zero elements in 'value' (provided as logical condition in 'i').

melt(df, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d

Or instead of melting, we can group by 'a', unlist the Subset of Data.table (.SD) and paste the names of the columns that corresponds to non-zero elements ('i1').

df[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]

Benchmarks

set.seed(24)
df1 <- data.table(a=1:1e6, b = sample(0:5, 1e6, 
   replace=TRUE), c = sample(0:4, 1e6, replace=TRUE), 
    d = sample(0:3, 1e6, replace=TRUE))

akrun1 <- function() {
   melt(df1, id.var='a')[value!=0, 
      .(z=paste(variable, collapse="_")), keyby =a]
    }

 akrun2 <- function() {
   df1[, {i1 <- !!unlist(.SD)
       paste(names(.SD)[i1], collapse="_")} , by= a]
   }

 ronak <- function() {
    data.table(z = lapply(apply(df1, 1, function(x)
                which(x[-1]!= 0)), 
       function(x) paste0(names(x), collapse = "_")))
   }

eddi <- function(){
 df1[, newcol := gsub("NA_|_NA|NA", "",                          
   do.call(function(...) paste(..., sep = "_"),            
     Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
 , .SDcols = b:d]

 }

alexis = function(x)
   {
   ans = character(nrow(x))
   for(j in seq_along(x)) {
    i = x[[j]] > 0L
    ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
   }
  return(gsub("^_", "", ans))
}





system.time(akrun1())
#   user  system elapsed 
#  22.04    0.15   22.36 
 system.time(akrun2())
#   user  system elapsed 
# 26.33    0.00   26.41 
 system.time(ronak())
#   user  system elapsed 
#  25.60    0.26   25.96 


system.time(alexis(df1[, -1L, with = FALSE]))
#   user  system elapsed 
#   1.92    0.06    2.09 

system.time(eddi())
#  user  system elapsed 
#   2.41    0.06    3.19 

Upvotes: 8

eddi
eddi

Reputation: 49448

Here's a direct approach:

df[, newcol := gsub("NA_|_NA|NA", "",                           # remove unwanted text
       do.call(function(...) paste(..., sep = "_"),             # paste colnames together
         Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # convert data to colnames
   , .SDcols = b:d]
#   a b c d newcol
#1: 1 1 0 1    b_d
#2: 2 0 1 0      c
#3: 3 2 1 0    b_c
#4: 4 5 0 2    b_d
#5: 5 1 0 2    b_d

On akrun's test data it's >10x faster.

Upvotes: 8

Ronak Shah
Ronak Shah

Reputation: 389275

This can be a bit lengthy.

For every row finding a column whose value is not 0 and then pasting the column names together.

data.table(a= df$a, z = lapply(apply(df, 1, 
           function(x) which(x[-1]!= 0)), 
           function(x) paste0(names(x), collapse = "_")))


#   a   z
#1: 1 b_d
#2: 2   c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d

Upvotes: 4

Related Questions