Amr Hassan
Amr Hassan

Reputation: 43

create matrix based on another matrix contain index of element

I have a matrix that contains indices of column ides to the element of matrix I want create it

>  index
         [,1] [,2] [,3] 
    [1,]    1    2    3    
    [2,]    1    2    5    
    [3,]    1    3    4    
    [4,]    1    3    5    
    [5,]    1    4    5    
    [6,]    2    3    5    
    [7,]    3    4    5    

example first row have the column id 1 ,2 , 3 that set to it value 1

second row have the column id 1 , 2 , 5 that set to it value 1

now I want to create the following matrix:


     a1 a2 a3  a4 a5
[1,]  1  1  1  0  0
[2,]  1  1  0  0  1
[3,]  1  0  1  1  0
[4,]  1  0  1  0  1
[5,]  1  0  0  1  1  
[6,]  0  1  1  0  1
[7,]  0  0  1  1  1

Data

index <- rbind(c(1,2,3), c(1,2,5), c(1,3,4), c(1,3,5), c(1,4,5), c(2,3,5), c(3,4,5))

Upvotes: 0

Views: 197

Answers (4)

Maurits Evers
Maurits Evers

Reputation: 50668

Here is a solution in base R:

# Your sample matrix
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);

# Construct empty matrix of the right dims
m.val <- matrix(0, nrow = nrow(m.idx), ncol = max(m.idx));

for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
m.val;
#     [,1] [,2] [,3] [,4] [,5]
#[1,]    1    1    1    0    0
#[2,]    1    1    0    0    1
#[3,]    1    0    1    1    0
#[4,]    1    0    1    0    1
#[5,]    1    0    0    1    1
#[6,]    0    1    1    0    1
#[7,]    0    0    1    1    1

Update

Please see below for benchmarking results of all methods presented here. I've wrapped all methods inside functions

# The original matrix with indices
m.idx <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), ncol = 3);

# For loop method
method.for_loop <- function(m) {
    m.val <- matrix(0, nrow = nrow(m), ncol = max(m));
    for (i in 1:nrow(m.idx)) m.val[i, m.idx[i, ]] <- 1;
    return(m.val);
}

# lapply method (@Headpoint)
method.lapply <- function(m) {
    m.val <- as.data.frame(matrix(0, nrow = nrow(m), ncol = max(m)));
    invisible(lapply(1:nrow(m),
                   function(x) m.val[x,][m[x,]] <<- 1));
    return(m.val);
}

# Direct indexing method (@lmo)
method.indexing <- function(m) {
    m.val <- matrix(0L, nrow(m.idx), max(m.idx));
    m.val[cbind(c(row(m.idx)), c(m.idx))] <- 1L;
    return(m.val);
}

# tidyr/dplyr method (@CPak)
method.dplyr_tidyr <- function(m) {
    as.data.frame(m) %>%
      gather() %>%                   # wide-to-long format
      group_by(key) %>%
      mutate(rn = row_number()) %>%  # add unique row_id per `key` group
      mutate(newval = 1) %>%         # fill in `existing` with this value
      ungroup() %>%                  # ungroup and unselect `key` group
      select(-key) %>%
      spread(value, newval, fill=0) %>%  # long-to-wide format
                                         # fill in `non-existing` with `0`
      select(-rn) %>%                    # unselect row_id column
      rename_all(funs(paste0("a", .)))   # rename columns
}

The results from microbenchmark are given below.

library(microbenchmark);
library(tidyr);
library(dplyr);
library(magrittr);    

res <- microbenchmark(
    for_loop = method.for_loop(m.idx),
    lapply = method.lapply(m.idx),
    indexing = method.indexing(m.idx),
    dplyr_tidyr = method.dplyr_tidyr(m.idx),
    times = 1000L
)
print(res);
#    Unit: microseconds
#        expr       min         lq        mean    median        uq       max
#    for_loop     6.796     9.5405    16.89643    13.497    20.445    96.537
#      lapply  1315.765  1441.5990  1696.74392  1518.256  1675.027 66181.880
#    indexing     5.695     8.1450    20.49116    14.918    20.094  3139.946
# dplyr_tidyr 18777.669 20525.8095 22225.51936 21647.120 23215.714 84791.858

Conclusion: The methods using a for loop or direct indexing are tied and the fastest. lapply is second, the tidyr/dplyr method the slowest (but note the large increases in runtime).

Upvotes: 2

lmo
lmo

Reputation: 38500

Here is an extremely fast and efficient base R method mentioned in the comments using matrix indexing.

# construct 0 matrix with correct dimensions
newMat <- matrix(0L, nrow(myMat), max(myMat))

# fill in matrix using matrix indexing
newMat[cbind(c(row(myMat)), c(myMat))] <- 1L

This returns

newMat
     [,1] [,2] [,3] [,4] [,5]
[1,]    1    1    1    0    0
[2,]    1    1    0    0    1
[3,]    1    0    1    1    0
[4,]    1    0    1    0    1
[5,]    1    0    0    1    1
[6,]    0    1    1    0    1
[7,]    0    0    1    1    1

data

myMat <-
structure(c(1L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 3L, 3L, 4L, 3L, 
4L, 3L, 5L, 4L, 5L, 5L, 5L, 5L), .Dim = c(7L, 3L))

Upvotes: 5

kangaroo_cliff
kangaroo_cliff

Reputation: 6222

No for loop, but didn't really check if it is faster.

index <- matrix(c(1,1,1,1,1,2,3,2,2,3,3,4,3,4,3,5,4,5,5,5,5), 
                                   ncol = 3)
df <- as.data.frame(matrix(0, nrow = 7, ncol = 5))

invisible(lapply(1:nrow(index), 
                   function(x) df[x,][index[x,]] <<- 1))
df
# V1 V2 V3 V4 V5
# 1  1  1  1  0  0
# 2  1  1  0  0  1
# 3  1  0  1  1  0
# 4  1  0  1  0  1
# 5  1  0  0  1  1
# 6  0  1  1  0  1
# 7  0  0  1  1  1

Upvotes: 1

CPak
CPak

Reputation: 13581

You can do this with a combination of dplyr and tidyr

Your data

df <- read.table(text="1    2    3    
1    2    5    
1    3    4    
1    3    5    
1    4    5    
2    3    5    
3    4    5", header=FALSE)

Solution: some of these steps are to clean up the output

df %>%
  gather() %>%                   # wide-to-long format
  group_by(key) %>%              
  mutate(rn = row_number()) %>%  # add unique row_id per `key` group
  mutate(newval = 1) %>%         # fill in `existing` with this value
  ungroup() %>%                  # ungroup and unselect `key` group
  select(-key) %>%
  spread(value, newval, fill=0) %>%  # long-to-wide format
                                     # fill in `non-existing` with `0`
  select(-rn) %>%                    # unselect row_id column
  rename_all(funs(paste0("a", .)))   # rename columns

Output

# A tibble: 7 x 5
     a1    a2    a3    a4    a5
* <dbl> <dbl> <dbl> <dbl> <dbl>
1     1     1     1     0     0
2     1     1     0     0     1
3     1     0     1     1     0
4     1     0     1     0     1
5     1     0     0     1     1
6     0     1     1     0     1
7     0     0     1     1     1

Upvotes: 0

Related Questions