Luker354
Luker354

Reputation: 669

Split multiple strings in multiple columns in a matrix in R

I have the following matrix of strings made up of 0s and 1s. Within a column, every string has the same number of characters, and some columns contain only single-character strings. I would like to split each string into separate columns so that every cell holds exactly one character, while leaving the single-character columns as they are:

# Example input: five rows of 0/1 strings. Column 1 holds single characters;
# columns 2-5 hold strings of 3, 4, 2 and 3 characters respectively.
r1 <- c("0","001","0001","01","100")
r2 <- c("1","001","0001","10","100")
r3 <- c("0","100","1000","10","010")
r4 <- c("0","010","0100","10","001")
r5<- c("0","010","0010","10","001")

# Stack the vectors as rows of a 5 x 5 character matrix.
n.mat <- rbind(r1,r2,r3,r4,r5)

The output:

# Desired result: the same five rows with every string split into single
# characters, giving 13 one-character columns per row.
r1 <- c("0","0","0","1","0","0","0","1","0","1","1","0","0")
r2 <- c("1","0","0","1","0","0","0","1","1","0","1","0","0")
r3 <- c("0","1","0","0","1","0","0","0","1","0","0","1","0")
r4 <- c("0","0","1","0","0","1","0","0","1","0","0","0","1")
r5 <- c("0","0","1","0","0","0","1","0","1","0","0","0","1")

# Stack the expected rows into the target matrix.
n.mat_new <- rbind(r1,r2,r3,r4,r5)

My code is below, but it fails because of the columns that contain only single-character strings:

# Attempt: split each column's strings into characters, build one matrix per
# column, and cbind the pieces together.
# NOTE(review): for a column of one-character strings, sapply() presumably
# simplifies to a plain vector, so t() yields a single-row matrix rather than
# one row per input row, and cbind() cannot combine it with the other pieces.
# Verify before relying on this explanation.
n.mat <- do.call(cbind, apply(n.mat, 2, function(x) {
  tmp <-strsplit(x, '')
  t(sapply(tmp, `[`, 1:max(lengths(tmp))))
}))

Upvotes: 2

Views: 406

Answers (4)

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193517

There's no need for apply or paste for this specific problem. Simply transpose the matrix, split all the strings, and re-construct the matrix according to the number of rows in the original matrix.

matrix(unlist(strsplit(t(n.mat), "")), nrow = nrow(n.mat), byrow = TRUE)
#      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# [1,] "0"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"   "1"   "0"   "0"  
# [2,] "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "1"  "0"   "1"   "0"   "0"  
# [3,] "0"  "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"   "0"   "1"   "0"  
# [4,] "0"  "0"  "1"  "0"  "0"  "1"  "0"  "0"  "1"  "0"   "0"   "0"   "1"  
# [5,] "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"  "0"   "0"   "0"   "1"  

If you want further optimizations, you can do something like the following, which also retains the row names:

# Single strsplit() call over the transposed matrix. `fixed = TRUE` is passed
# by name so the literal (non-regex) split is explicit, `use.names = FALSE`
# skips name handling in unlist(), and the original row names are reattached
# through `dimnames`.
matrix(unlist(strsplit(t(n.mat), "", fixed = TRUE), use.names = FALSE), 
       nrow = nrow(n.mat), byrow = TRUE, 
       dimnames = list(rownames(n.mat), NULL))
#    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"   "1"   "0"   "0"  
# r2 "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "1"  "0"   "1"   "0"   "0"  
# r3 "0"  "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"   "0"   "1"   "0"  
# r4 "0"  "0"  "1"  "0"  "0"  "1"  "0"  "0"  "1"  "0"   "0"   "0"   "1"  
# r5 "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"  "0"   "0"   "0"   "1" 

By avoiding apply, you're only calling strsplit once, so you're going to notice much better performance if you have a lot of rows to process.

On my Chromebook (so these times are likely to be slow to begin with) testing with 10,000 rows, I get the following:

nrow(n.mat)
# [1] 10000

bench::mark(am_opt(), am(), gki(), jay(), check = FALSE)
# # A tibble: 4 x 13
#   expression     min  median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time
#   <bch:expr> <bch:t> <bch:t>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm>
# 1 am_opt()    28.3ms  40.1ms     27.4     2.75MB     0       14     0      511ms
# 2 am()        36.1ms  41.2ms     24.6     2.75MB     0       13     0      528ms
# 3 gki()      220.3ms 229.4ms      4.39    3.43MB     0        3     0      683ms
# 4 jay()      975.8ms 975.8ms      1.02    3.51MB     1.02     1     1      976ms
# # … with 4 more variables: result <list>, memory <list>, time <list>, gc <list>

I didn't benchmark Karthik's answer because just running it once took more than 1 minute.

system.time(karthik())
#    user  system elapsed 
#  81.341   0.000  81.343 

Where the functions are directly copied from the other answers:

# Split every string of a character matrix into single characters, producing
# one character per cell while keeping the row names.
#
# x: character matrix to split. Defaults to the global `n.mat`, so existing
#    zero-argument calls such as `am_opt()` behave exactly as before.
# Returns: a character matrix with nrow(x) rows, row names taken from x and no
#    column names.
# Assumes every row of x holds the same total number of characters, as in the
# question's data; otherwise matrix() cannot fill the rows evenly.
am_opt <- function(x = n.mat) {
  # t() lays each original row out contiguously, so a single strsplit() call
  # returns the characters already in row order.
  chars <- unlist(strsplit(t(x), "", fixed = TRUE), use.names = FALSE)
  matrix(chars,
         nrow = nrow(x), byrow = TRUE,
         dimnames = list(rownames(x), NULL))
}
# Baseline answer: split the transposed global `n.mat` into single characters
# and refill a matrix row by row (dimnames are not carried over).
am <- function() {
  chars <- unlist(strsplit(t(n.mat), ""))
  matrix(chars, nrow = nrow(n.mat), byrow = TRUE)
}
# GKi's answer: run strsplit() on each row of the global `n.mat` via apply(),
# flatten the pieces, and refill a matrix row by row.
gki <- function() {
  row_pieces <- apply(n.mat, 1, strsplit, split = "")
  chars <- unlist(row_pieces)
  matrix(chars, nrow(n.mat), byrow = TRUE)
}
# jay.sf's answer: paste each row of the global `n.mat` into one string, split
# that string into single characters, and transpose so rows stay rows.
jay <- function() {
  split_row <- function(cells) {
    joined <- Reduce(paste0, cells)
    el(strsplit(joined, ""))
  }
  t(apply(n.mat, 1, split_row))
}
# Karthik S's answer wrapped for timing. Needs dplyr attached (bind_rows() and
# %>%) and reads the global `n.mat`; the per-column strsplit() pieces are bound
# together and the result is transposed.
karthik <- function() bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t

Upvotes: 1

GKi
GKi

Reputation: 39657

You can use strsplit inside apply, unlist the result, and build a matrix from it.

matrix(unlist(apply(n.mat, 1, strsplit, split = "")), nrow(n.mat), byrow=TRUE)
#     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
#[1,] "0"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"   "1"   "0"   "0"  
#[2,] "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "1"  "0"   "1"   "0"   "0"  
#[3,] "0"  "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"   "0"   "1"   "0"  
#[4,] "0"  "0"  "1"  "0"  "0"  "1"  "0"  "0"  "1"  "0"   "0"   "0"   "1"  
#[5,] "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"  "0"   "0"   "0"   "1"  

Upvotes: 1

Karthik S
Karthik S

Reputation: 11584

Does this work?

library(dplyr)
bind_rows(apply(n.mat, 2, strsplit, split = '')) %>% t
   [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
r1 "0"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"   "1"   "0"   "0"  
r2 "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "1"  "0"   "1"   "0"   "0"  
r3 "0"  "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"   "0"   "1"   "0"  
r4 "0"  "0"  "1"  "0"  "0"  "1"  "0"  "0"  "1"  "0"   "0"   "0"   "1"  
r5 "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"  "0"   "0"   "0"   "1"  

Upvotes: 1

jay.sf
jay.sf

Reputation: 72683

Collapse each row into a single string with paste0 via Reduce, then use strsplit with "" as the separator.

t(apply(n.mat, 1, function(x) el(strsplit(Reduce(paste0, x), ""))))
#    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13]
# r1 "0"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"   "1"   "0"   "0"  
# r2 "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "1"  "0"   "1"   "0"   "0"  
# r3 "0"  "1"  "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"   "0"   "1"   "0"  
# r4 "0"  "0"  "1"  "0"  "0"  "1"  "0"  "0"  "1"  "0"   "0"   "0"   "1"  
# r5 "0"  "0"  "1"  "0"  "0"  "0"  "1"  "0"  "1"  "0"   "0"   "0"   "1" 

Upvotes: 1

Related Questions