user3600910
user3600910

Reputation: 3109

Get indices from each row and merge with original data.frame

I have the following data.frame

                   user_id 1 2 3 4 5 6 7 8 9
1           54449024717783 0 0 1 0 0 0 0 0 0
2          117592134783793 0 0 0 0 0 1 0 0 0
3          187145545782493 0 0 1 0 0 0 0 0 0
4          245003020993334 0 0 0 0 0 1 0 0 0
5          332625230637592 0 1 0 0 0 0 0 0 0
6          336336752713947 0 1 0 0 0 0 0 0 0

what I would like to do is to create one column (and remove 1:9) and insert the column name where I have the value 1 , each user contain only column with the value 1 ,

If im running the following function:

rowSums(users_cluster(users_cluster), dims = 1)

it will summarize all the rows value but I need to duplicate it with the column name

Upvotes: 3

Views: 161

Answers (6)

Uwe
Uwe

Reputation: 42564

For the sake of completeness, here is also a data.table solution which uses melt() to reshape from wide to long format:

library(data.table)
melt(setDT(DF), id = "user_id")[value == 1L][order(user_id), !"value"]

           user_id variable
1:  54449024717783        3
2: 117592134783793        6
3: 187145545782493        3
4: 245003020993334        6
5: 332625230637592        2
6: 336336752713947        2

This takes advantage of the fact that the sample dataset is already sorted by ascending user_id.

In case the sample dataset has a different order which should be maintained in the final result, it is necessary to remember that order by introducing a temporary row id:

melt(setDT(DF), id = "user_id")[, rn := rowid(variable)][value == 1L][
  order(rn), !c("rn", "value")]

or, alternatively,

melt(setDT(DF), id = "user_id")[, rn := rowid(variable)][, setorder(.SD, rn)][
  value == 1L, !c("rn", "value")]

Data

library(data.table)
DF <- fread(
  "i                   user_id 1 2 3 4 5 6 7 8 9
  1           54449024717783 0 0 1 0 0 0 0 0 0
  2          117592134783793 0 0 0 0 0 1 0 0 0
  3          187145545782493 0 0 1 0 0 0 0 0 0
  4          245003020993334 0 0 0 0 0 1 0 0 0
  5          332625230637592 0 1 0 0 0 0 0 0 0
  6          336336752713947 0 1 0 0 0 0 0 0 0"
, drop = 1L)[, lapply(.SD, as.integer), by = user_id]

Upvotes: 1

akrun
akrun

Reputation: 887511

Another approach is max.col from base R as the user specified each user contain only column with the value 1

cbind(dat[1], ind = max.col(dat[-1], 'first'))
#          user_id ind
#1  54449024717783   3
#2 117592134783793   6
#3 187145545782493   3
#4 245003020993334   6
#5 332625230637592   2
#6 336336752713947   2

Upvotes: 5

www
www

Reputation: 39154

A solution using the .

library(tidyverse)
dat2 <- dat %>%
  mutate(ID = 1:n()) %>%
  gather(Column, Value, -user_id, -ID) %>%
  filter(Value == 1) %>%
  arrange(ID) %>%
  select(-Value, -ID) %>%
  as.data.frame()
dat2
#           user_id Column
# 1  54449024717783      3
# 2 117592134783793      6
# 3 187145545782493      3
# 4 245003020993334      6
# 5 332625230637592      2
# 6 336336752713947      2

DATA

dat <- read.table(text = "                  user_id 1 2 3 4 5 6 7 8 9
1           54449024717783 0 0 1 0 0 0 0 0 0
2          117592134783793 0 0 0 0 0 1 0 0 0
3          187145545782493 0 0 1 0 0 0 0 0 0
4          245003020993334 0 0 0 0 0 1 0 0 0
5          332625230637592 0 1 0 0 0 0 0 0 0
6          336336752713947 0 1 0 0 0 0 0 0 0",
                  header = TRUE, stringsAsFactors = FALSE)

library(tidyverse)

dat <- as.tibble(dat) %>%
  setNames(sub("X", "", names(.))) %>%
  mutate(user_id = as.character(user_id))

Upvotes: 3

Florian
Florian

Reputation: 25395

Another base R solution:

df$ind = apply(df[,-1]>0,1,which)
df[,c("user_id","ind")]

Output:

       user_id ind
1 5.444902e+13   3
2 1.175921e+14   6
3 1.871455e+14   3
4 2.450030e+14   6
5 3.326252e+14   2
6 3.363368e+14   2

Upvotes: 3

mtoto
mtoto

Reputation: 24198

Here's another base R option:

inds <- which(df[,-1]!=0,TRUE)
df$newcol <- inds[order(row.names(inds)),][,2]

df[,c(1,11)]
#       user_id newcol
#1 5.444902e+13      3
#2 1.175921e+14      6
#3 1.871455e+14      3
#4 2.450030e+14      6
#5 3.326252e+14      2
#6 3.363368e+14      2

Upvotes: 5

pogibas
pogibas

Reputation: 28369

Base R solution:

data.frame(user_id = df[, 1],
           name = which(t(df[, -1] == 1)) %% (ncol(df) - 1))

#           user_id name
# 1  54449024717783    3
# 2 117592134783793    6
# 3 187145545782493    3
# 4 245003020993334    6
# 5 332625230637592    2
# 6 336336752713947    2

Upvotes: 8

Related Questions