Reputation: 178
My data set looks like this one:
library(data.table)
df <- data.table(a = c(1,2,3,4,5),
b = c(1,0,2,5,1),
c = c(0,1,1,0,0),
d = c(1,0,0,2,2))
df
# a b c d
# 1: 1 1 0 1
# 2: 2 0 1 0
# 3: 3 2 1 0
# 4: 4 5 0 2
# 5: 5 1 0 2
I want to create a new column with non-null columns names. The result will be:
df_result <- data.table(a = c(1,2,3,4,5),
z = c('b_d', 'c', 'b_c', 'b_d', 'b_d'))
df_result
# a z
# 1: 1 b_d
# 2: 2 c
# 3: 3 b_c
# 4: 4 b_d
# 5: 5 b_d
Upvotes: 6
Views: 614
Reputation: 13122
Assuming nrow >> ncol
, you could work columnwise
ff = function(x)
{
ans = character(nrow(x))
for(j in seq_along(x)) {
i = x[[j]] > 0L
ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
}
return(gsub("^_", "", ans))
}
ff(df[, -1L, with = FALSE]) #or, `df[, ff(.SD), .SDcols = -1L]` from David Arenburg
#[1] "b_d" "c" "b_c" "b_d" "b_d"
Upvotes: 12
Reputation: 887901
One option would be to convert the format from 'wide' to 'long' using melt
. Grouped by 'a', we paste
the 'variable' elements that corresponds to non-zero elements in 'value' (provided as logical condition in 'i').
melt(df, id.var='a')[value!=0,
.(z=paste(variable, collapse="_")), keyby =a]
# a z
#1: 1 b_d
#2: 2 c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d
Or instead of melt
ing, we can group by 'a', unlist
the Subset of Data.table (.SD
) and paste
the names
of the columns that corresponds to non-zero elements ('i1').
df[, {i1 <- !!unlist(.SD)
paste(names(.SD)[i1], collapse="_")} , by= a]
set.seed(24)
df1 <- data.table(a=1:1e6, b = sample(0:5, 1e6,
replace=TRUE), c = sample(0:4, 1e6, replace=TRUE),
d = sample(0:3, 1e6, replace=TRUE))
akrun1 <- function() {
melt(df1, id.var='a')[value!=0,
.(z=paste(variable, collapse="_")), keyby =a]
}
akrun2 <- function() {
df1[, {i1 <- !!unlist(.SD)
paste(names(.SD)[i1], collapse="_")} , by= a]
}
ronak <- function() {
data.table(z = lapply(apply(df1, 1, function(x)
which(x[-1]!= 0)),
function(x) paste0(names(x), collapse = "_")))
}
eddi <- function(){
df1[, newcol := gsub("NA_|_NA|NA", "",
do.call(function(...) paste(..., sep = "_"),
Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD)))
, .SDcols = b:d]
}
alexis = function(x)
{
ans = character(nrow(x))
for(j in seq_along(x)) {
i = x[[j]] > 0L
ans[i] = paste(ans[i], names(x)[[j]], sep = "_")
}
return(gsub("^_", "", ans))
}
system.time(akrun1())
# user system elapsed
# 22.04 0.15 22.36
system.time(akrun2())
# user system elapsed
# 26.33 0.00 26.41
system.time(ronak())
# user system elapsed
# 25.60 0.26 25.96
system.time(alexis(df1[, -1L, with = FALSE]))
# user system elapsed
# 1.92 0.06 2.09
system.time(eddi())
# user system elapsed
# 2.41 0.06 3.19
Upvotes: 8
Reputation: 49448
Here's a direct approach:
df[, newcol := gsub("NA_|_NA|NA", "", # remove unwanted text
do.call(function(...) paste(..., sep = "_"), # paste colnames together
Map(function(x, y) x[(y == 0) + 1], names(.SD), .SD))) # convert data to colnames
, .SDcols = b:d]
# a b c d newcol
#1: 1 1 0 1 b_d
#2: 2 0 1 0 c
#3: 3 2 1 0 b_c
#4: 4 5 0 2 b_d
#5: 5 1 0 2 b_d
On akrun's test data it's >10x faster.
Upvotes: 8
Reputation: 389275
This can be a bit lengthy.
For every row finding a column whose value is not 0 and then pasting the column names together.
data.table(a= df$a, z = lapply(apply(df, 1,
function(x) which(x[-1]!= 0)),
function(x) paste0(names(x), collapse = "_")))
# a z
#1: 1 b_d
#2: 2 c
#3: 3 b_c
#4: 4 b_d
#5: 5 b_d
Upvotes: 4