Reputation: 1327

Convert numbers in comma-separated string within a data.table column into a long table form

Given is a data.table with a string column. The strings contain comma-separated values representing an arbitrary number of (x, y, z) points (so the number of comma-separated values is dividable by 3, e.g. '1,2,3,4,5,6' corresponds to two points (1, 2, 3), (4, 5, 6)). I want to convert these strings into a long table so that each row contains only one of these points. The former data.table should be extended and the other columns copied to the corresponding number of added rows.

I solved the task, but with a ugly combination of strsplit + matrix iterating over individual rows with lapply(1:nrow(DT)) which is most probably very inefficient. I wonder if there is a more elegant solution. Also I run out of RAM using a 300k rows data.table.

Generate example data

library(data.table)
set.seed(1237)
N <- 5 # number of rows for test data
listlengths <- round(runif(N, 1, 5))*3 # length of row-wise comma separated lists of 3D-points

generateStrList <- function(n){
  paste0(collapse = ",", round(runif(n, 0, 100)))
}

strlist <- lapply(listlengths, generateStrList)

# The follwoing data.table is given for the problem (read from a file with 'fread')
DT <- data.table(id = 1:N, b = round(runif(N, 0, 100)), c = strlist)
print(DT)

   id  b                                            c
1:  1 10                            80,96,40,83,86,12
2:  2 92 86,18,38,51,17,80,33,38,23,49,71,97,10,13,70
3:  3 76                                     84,39,86
4:  4 81                                      48,99,8
5:  5 56                             53,92,27,2,39,62

Solve task (somewhat inefficiently)

# separate the points (x, y, z) encoded in string into a long table
separateList <- function(DT){
  CommaSeparatedList <- DT$c
  DT_new <- as.data.table(
    matrix( # convert to matrix to get 3 columns
      as.numeric( # convert to numerics
        strsplit(unlist(CommaSeparatedList), split = ",")[[1]]), # split string at commas into string vector (instead of list)
      ncol = 3, byrow = T)
    )
  setnames(DT_new, c("x", "y", "z"))
  DT_new[ , id := DT$id] # add columns 'id' and 'b' from original data.table, 
  DT_new[ , b := DT$b]     # they will have the same length as the listlength / 3
  return(DT_new[])
}

# test for first item only
separateList(DT[1])

    x  y  z id  b
1: 80 96 40  1 10
2: 83 86 12  1 10

#  apply on whole data set
DT_Long <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
print(DT_Long)

     x  y  z id  b
 1: 80 96 40  1 10 # in DT the rows 1 and 2 here were in the first row
 2: 83 86 12  1 10
 3: 86 18 38  2 92 # in DT row 2 contained 5 (x, y, z) points, so are extended to five rows here
 4: 51 17 80  2 92 # 'id' and 'b' are copied to fill DT_Long
 5: 33 38 23  2 92
 6: 49 71 97  2 92
 7: 10 13 70  2 92
 8: 84 39 86  3 76
 9: 48 99  8  4 81
10: 53 92 27  5 56
11:  2 39 62  5 56

Edit: Benchmarking

The given solutions (slightly modified to match the results exactly)

foo_phann <- function(DT){
  DT <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
  setkey(DT, id)
  return(DT[])
}

foo_ronak <- function(DT){
  DT <- as.data.table(DT %>%
    separate_rows(c, sep = ',') %>%
    group_by(grp = ceiling(row_number()/3)) %>%
    mutate(cols = c('x', 'y', 'z')) %>%
    pivot_wider(names_from = cols, values_from  =c) %>%
    ungroup %>%
    select(-grp))[ , c("x", "y", "z", "id", "b")] # changed the column order to have identical results for benchmarking and the column type
  DT[ , c("x", "y", "z") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z")]
  setkey(DT, id)
  return(DT[])
}

foo_zx <- function(DT){
  DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id 
     ][, rn1 := factor(seq_len(.N) %% 3, 
                       levels = c(1,2,0), labels = c("x", "y", "z")), by = id
       ][, rn2 := seq_len(.N), by = .(id, rn1)
         ][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")][ , c("x", "y", "z", "id", "b")]
# changed the column order and column type to match the results
  DT[ , c("x", "y", "z", "b") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z", "b")]
  return(DT[])
}

foo_a5 <- function(DT) {
  # unlist the relevant column and use strsplit, but don't make your matrices yet
  a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
  # expand all the other columns of the input data.table...
  DT <- cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2], 
        # ... and bind it with your newly formed (single) matrix
        matrix(as.integer(unlist(a, use.names=FALSE)),
               ncol = 3, byrow = TRUE, 
               dimnames = list(NULL, c("x", "y", "z"))))
  setcolorder(DT, c("x", "y", "z", "id", "b"))
  setkey(DT, "id")
  return(DT[])
}

give the following benchmarks for N=1000 and N=5000:

bench::mark(
  Method1 = foo_phann(DT),
  Method2 = foo_ronak(DT),
  Method3 = foo_zx(DT),
  Method4 = foo_a5(DT)
)

# N=1000
# A tibble: 4 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result       memory      time    gc      
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>       <list>      <list>  <list>  
1 Method1        1.3s     1.3s     0.766   96.05MB     3.83     1     5       1.3s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2     43.02ms  48.84ms    19.8      11.2MB     5.94    10     3      505ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3    153.53ms 156.08ms     5.98     9.74MB     7.97     3     4      502ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4      5.77ms   6.67ms   147.     417.88KB     1.98    74     1    505.1ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~

#N = 5000
    # A tibble: 4 x 13
  expression      min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc total_time result       memory      time    gc      
  <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>   <bch:tm> <list>       <list>      <list>  <list>  
1 Method1       6.98s    6.98s     0.143   481.2MB     5.59     1    39      6.98s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2    194.08ms 198.01ms     3.81     55.5MB     6.35     3     5   787.93ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3       1.43s    1.43s     0.699   199.6MB    16.1      1    23      1.43s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4     12.54ms  13.79ms    68.6       1.9MB     0       35     0   509.89ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~

As expected my solution (Method1) is inefficient in comparison with the other two solutions. The dplyr solution (Method2) is faster and more memory efficient than the data.table approach (Method3) for a large number of rows. Unfortunatly, after about half a hour of calculating my original 300k rows data.table gives up with a memory error (using Method2). I guess I have to first split the data.table into multiple ones and process them independently. However, the given solutions are both nice improvements of my code!

Edit: The method foo_a5() of @A5C1D2H2I1M1N2O1R2T1 runs through my whole data seamlessly!

Out of pure curiosity I tested all four methods for a broad range of numbers:

Upvotes: 5

Answers (3)

zx8754

Reputation: 56159

Using data.table, split delimited string into rows, create groups 1,2,3 using mod, the reshape long-to-wide using dcast:

DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id 
   ][, rn1 := factor(.I %% 3, levels = c(1,2,0), labels = c("x", "y", "z")), by = id
     ][, rn2 := seq_len(.N), by = .(id, rn1)
       ][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")]

#     id  b rn2  x  y  z
#  1:  1 10   1 80 96 40
#  2:  1 10   2 83 86 12
#  3:  2 92   1 86 18 38
#  4:  2 92   2 51 17 80
#  5:  2 92   3 33 38 23
#  6:  2 92   4 49 71 97
#  7:  2 92   5 10 13 70
#  8:  3 76   1 84 39 86
#  9:  4 81   1 48 99  8
# 10:  5 56   1 53 92 27
# 11:  5 56   2  2 39 62

Upvotes: 2

A5C1D2H2I1M1N2O1R2T1

Reputation: 193517

strsplit and matrix are both fast, but you're not using them in an optimized manner. Here's the approach I'd suggest:

foo_a5 <- function(DT) {
  # unlist the relevant column and use strsplit, but don't make your matrices yet
  a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
  # expand all the other columns of the input data.table...
  cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2], 
        # ... and bind it with your newly formed (single) matrix
        matrix(as.integer(unlist(a, use.names=FALSE)),
               ncol = 3, byrow = TRUE, 
               dimnames = list(NULL, c("x", "y", "z"))))
}

foo_a5(DT)
##     id  b  x  y  z
##  1:  1 10 80 96 40
##  2:  1 10 83 86 12
##  3:  2 92 86 18 38
##  4:  2 92 51 17 80
##  5:  2 92 33 38 23
##  6:  2 92 49 71 97
##  7:  2 92 10 13 70
##  8:  3 76 84 39 86
##  9:  4 81 48 99  8
## 10:  5 56 53 92 27
## 11:  5 56  2 39 62

An alternative to @zx8754's answer that uses a similar logic is the following:

foo_zx2 <- function(DT) {
  L <- DT[, list(c = unlist(strsplit(unlist(c, use.names = FALSE), ",", TRUE), 
                            use.names = FALSE)), .(id, b)]
  L[, time := rep(c("x", "y", "z"), length.out = nrow(L))][
    , dcast(.SD, id + b + rowid(time) ~ time, value.var = "c")]
}

This tests faster than @Ronak's approach for me, but still slower than just using base R.

Upvotes: 1

Ronak Shah

Reputation: 388982

Using dplyr and tidyr you can split data on comma, create group of 3 rows and get the data in wide format.

library(dplyr)
library(tidyr)

DT %>%
  separate_rows(c, sep = ',') %>%
  group_by(grp = ceiling(row_number()/3)) %>%
  mutate(cols = c('x', 'y', 'z')) %>%
  pivot_wider(names_from = cols, values_from  =c) %>%
  ungroup %>%
  select(-grp)

#      id     b x     y     z    
#   <int> <dbl> <chr> <chr> <chr>
# 1     1    10 80    96    40   
# 2     1    10 83    86    12   
# 3     2    92 86    18    38   
# 4     2    92 51    17    80   
# 5     2    92 33    38    23   
# 6     2    92 49    71    97   
# 7     2    92 10    13    70   
# 8     3    76 84    39    86   
# 9     4    81 48    99    8    
#10     5    56 53    92    27   
#11     5    56 2     39    62

Upvotes: 3

Convert numbers in comma-separated string within a data.table column into a long table form

Generate example data

Solve task (somewhat inefficiently)

Edit: Benchmarking

Answers (3)

Related Questions