Reputation: 1327
Given is a data.table with a string column. The strings contain comma-separated values representing an arbitrary number of (x, y, z) points (so the number of comma-separated values is dividable by 3, e.g. '1,2,3,4,5,6' corresponds to two points (1, 2, 3), (4, 5, 6)). I want to convert these strings into a long table so that each row contains only one of these points. The former data.table should be extended and the other columns copied to the corresponding number of added rows.
I solved the task, but with a ugly combination of strsplit + matrix
iterating over individual rows with lapply(1:nrow(DT))
which is most probably very inefficient. I wonder if there is a more elegant solution. Also I run out of RAM using a 300k rows data.table.
library(data.table)
set.seed(1237)
N <- 5 # number of rows for test data
listlengths <- round(runif(N, 1, 5))*3 # length of row-wise comma separated lists of 3D-points
generateStrList <- function(n){
paste0(collapse = ",", round(runif(n, 0, 100)))
}
strlist <- lapply(listlengths, generateStrList)
# The follwoing data.table is given for the problem (read from a file with 'fread')
DT <- data.table(id = 1:N, b = round(runif(N, 0, 100)), c = strlist)
print(DT)
id b c
1: 1 10 80,96,40,83,86,12
2: 2 92 86,18,38,51,17,80,33,38,23,49,71,97,10,13,70
3: 3 76 84,39,86
4: 4 81 48,99,8
5: 5 56 53,92,27,2,39,62
# separate the points (x, y, z) encoded in string into a long table
separateList <- function(DT){
CommaSeparatedList <- DT$c
DT_new <- as.data.table(
matrix( # convert to matrix to get 3 columns
as.numeric( # convert to numerics
strsplit(unlist(CommaSeparatedList), split = ",")[[1]]), # split string at commas into string vector (instead of list)
ncol = 3, byrow = T)
)
setnames(DT_new, c("x", "y", "z"))
DT_new[ , id := DT$id] # add columns 'id' and 'b' from original data.table,
DT_new[ , b := DT$b] # they will have the same length as the listlength / 3
return(DT_new[])
}
# test for first item only
separateList(DT[1])
x y z id b
1: 80 96 40 1 10
2: 83 86 12 1 10
# apply on whole data set
DT_Long <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
print(DT_Long)
x y z id b
1: 80 96 40 1 10 # in DT the rows 1 and 2 here were in the first row
2: 83 86 12 1 10
3: 86 18 38 2 92 # in DT row 2 contained 5 (x, y, z) points, so are extended to five rows here
4: 51 17 80 2 92 # 'id' and 'b' are copied to fill DT_Long
5: 33 38 23 2 92
6: 49 71 97 2 92
7: 10 13 70 2 92
8: 84 39 86 3 76
9: 48 99 8 4 81
10: 53 92 27 5 56
11: 2 39 62 5 56
The given solutions (slightly modified to match the results exactly)
foo_phann <- function(DT){
DT <- rbindlist(lapply(1:nrow(DT), function(x) separateList(DT[x])))
setkey(DT, id)
return(DT[])
}
foo_ronak <- function(DT){
DT <- as.data.table(DT %>%
separate_rows(c, sep = ',') %>%
group_by(grp = ceiling(row_number()/3)) %>%
mutate(cols = c('x', 'y', 'z')) %>%
pivot_wider(names_from = cols, values_from =c) %>%
ungroup %>%
select(-grp))[ , c("x", "y", "z", "id", "b")] # changed the column order to have identical results for benchmarking and the column type
DT[ , c("x", "y", "z") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z")]
setkey(DT, id)
return(DT[])
}
foo_zx <- function(DT){
DT <- DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id
][, rn1 := factor(seq_len(.N) %% 3,
levels = c(1,2,0), labels = c("x", "y", "z")), by = id
][, rn2 := seq_len(.N), by = .(id, rn1)
][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")][ , c("x", "y", "z", "id", "b")]
# changed the column order and column type to match the results
DT[ , c("x", "y", "z", "b") := lapply(.SD, as.numeric), .SDcols = c("x", "y", "z", "b")]
return(DT[])
}
foo_a5 <- function(DT) {
# unlist the relevant column and use strsplit, but don't make your matrices yet
a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
# expand all the other columns of the input data.table...
DT <- cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2],
# ... and bind it with your newly formed (single) matrix
matrix(as.integer(unlist(a, use.names=FALSE)),
ncol = 3, byrow = TRUE,
dimnames = list(NULL, c("x", "y", "z"))))
setcolorder(DT, c("x", "y", "z", "id", "b"))
setkey(DT, "id")
return(DT[])
}
give the following benchmarks for N=1000 and N=5000:
bench::mark(
Method1 = foo_phann(DT),
Method2 = foo_ronak(DT),
Method3 = foo_zx(DT),
Method4 = foo_a5(DT)
)
# N=1000
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 Method1 1.3s 1.3s 0.766 96.05MB 3.83 1 5 1.3s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2 43.02ms 48.84ms 19.8 11.2MB 5.94 10 3 505ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3 153.53ms 156.08ms 5.98 9.74MB 7.97 3 4 502ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4 5.77ms 6.67ms 147. 417.88KB 1.98 74 1 505.1ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
#N = 5000
# A tibble: 4 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 Method1 6.98s 6.98s 0.143 481.2MB 5.59 1 39 6.98s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
2 Method2 194.08ms 198.01ms 3.81 55.5MB 6.35 3 5 787.93ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
3 Method3 1.43s 1.43s 0.699 199.6MB 16.1 1 23 1.43s <data.table~ <Rprofmem[~ <bch:t~ <tibble~
4 Method4 12.54ms 13.79ms 68.6 1.9MB 0 35 0 509.89ms <data.table~ <Rprofmem[~ <bch:t~ <tibble~
As expected my solution (Method1) is inefficient in comparison with the other two solutions. The dplyr solution (Method2) is faster and more memory efficient than the data.table approach (Method3) for a large number of rows. Unfortunatly, after about half a hour of calculating my original 300k rows data.table gives up with a memory error (using Method2). I guess I have to first split the data.table into multiple ones and process them independently. However, the given solutions are both nice improvements of my code!
Edit: The method foo_a5() of @A5C1D2H2I1M1N2O1R2T1 runs through my whole data seamlessly!
Out of pure curiosity I tested all four methods for a broad range of numbers:
Upvotes: 5
Views: 1039
Reputation: 56159
Using data.table, split delimited string into rows, create groups 1,2,3 using mod, the reshape long-to-wide using dcast:
DT[, lapply(.SD, function(x) unlist(tstrsplit(x, ",", fixed = TRUE))), by = id
][, rn1 := factor(.I %% 3, levels = c(1,2,0), labels = c("x", "y", "z")), by = id
][, rn2 := seq_len(.N), by = .(id, rn1)
][ , dcast(.SD, id+b+rn2~rn1, value.var = "c")]
# id b rn2 x y z
# 1: 1 10 1 80 96 40
# 2: 1 10 2 83 86 12
# 3: 2 92 1 86 18 38
# 4: 2 92 2 51 17 80
# 5: 2 92 3 33 38 23
# 6: 2 92 4 49 71 97
# 7: 2 92 5 10 13 70
# 8: 3 76 1 84 39 86
# 9: 4 81 1 48 99 8
# 10: 5 56 1 53 92 27
# 11: 5 56 2 2 39 62
Upvotes: 2
Reputation: 193517
strsplit
and matrix
are both fast, but you're not using them in an optimized manner. Here's the approach I'd suggest:
foo_a5 <- function(DT) {
# unlist the relevant column and use strsplit, but don't make your matrices yet
a <- strsplit(unlist(DT$c, use.names = FALSE), ",", TRUE)
# expand all the other columns of the input data.table...
cbind(DT[rep(seq.int(nrow(DT)), lengths(a)/3), 1:2],
# ... and bind it with your newly formed (single) matrix
matrix(as.integer(unlist(a, use.names=FALSE)),
ncol = 3, byrow = TRUE,
dimnames = list(NULL, c("x", "y", "z"))))
}
foo_a5(DT)
## id b x y z
## 1: 1 10 80 96 40
## 2: 1 10 83 86 12
## 3: 2 92 86 18 38
## 4: 2 92 51 17 80
## 5: 2 92 33 38 23
## 6: 2 92 49 71 97
## 7: 2 92 10 13 70
## 8: 3 76 84 39 86
## 9: 4 81 48 99 8
## 10: 5 56 53 92 27
## 11: 5 56 2 39 62
An alternative to @zx8754's answer that uses a similar logic is the following:
foo_zx2 <- function(DT) {
L <- DT[, list(c = unlist(strsplit(unlist(c, use.names = FALSE), ",", TRUE),
use.names = FALSE)), .(id, b)]
L[, time := rep(c("x", "y", "z"), length.out = nrow(L))][
, dcast(.SD, id + b + rowid(time) ~ time, value.var = "c")]
}
This tests faster than @Ronak's approach for me, but still slower than just using base R.
Upvotes: 1
Reputation: 388982
Using dplyr
and tidyr
you can split data on comma, create group of 3 rows and get the data in wide format.
library(dplyr)
library(tidyr)
DT %>%
separate_rows(c, sep = ',') %>%
group_by(grp = ceiling(row_number()/3)) %>%
mutate(cols = c('x', 'y', 'z')) %>%
pivot_wider(names_from = cols, values_from =c) %>%
ungroup %>%
select(-grp)
# id b x y z
# <int> <dbl> <chr> <chr> <chr>
# 1 1 10 80 96 40
# 2 1 10 83 86 12
# 3 2 92 86 18 38
# 4 2 92 51 17 80
# 5 2 92 33 38 23
# 6 2 92 49 71 97
# 7 2 92 10 13 70
# 8 3 76 84 39 86
# 9 4 81 48 99 8
#10 5 56 53 92 27
#11 5 56 2 39 62
Upvotes: 3