Crops
Crops

Reputation: 5162

data.table - split multiple columns

I have a data.table G as follows.

d <- list( c("SD1:LUSH", "SD44:CANCEL", "SD384:FR563", "SD32:TRUMPET"), c("SD23:SWITCH", "SD1:LUSH", "SD567:TREK"), c("SD42:CRAYON", "SD345:FOX", "SD183:WIRE"), c("SD345:HOLE", "SD340:DUST", "SD387:ROLL"), c("SD455:TOMATO", "SD86:RAY", "SD39:MATURE", "SD86:COSMIC"), c("SD12:PAINTING", "SD315:MONEY31", "SD387:SPRING"),  c("SD32:TRUMPET", "SD1:FIELD", "SD40:GREEN", "SD40:PARK"))

d2 <-  lapply(d, function(x) sapply(strsplit(x, ":"), "[", 1))
d <- lapply(d, function(x) paste0(unique(x), collapse=", "))
d2 <- lapply(d2, function(x) paste0(unique(x), collapse=", "))
d <- as.data.frame(as.matrix(lapply(d, paste0, collapse=", ")))
d2 <- as.data.frame(as.matrix(lapply(d2, paste0, collapse=", ")))
d <- as.data.frame(cbind(d,d2))
colnames(d) <- c("sdw", "sd")
d$sd <- as.character(d$sd)
d$sdw <- as.character(d$sdw)

G <- data.table( d , key = "sd" )
G[, GNO:= seq(1,nrow(G))]
setcolorder(G, neworder=c("GNO", "sd", "sdw"))

G
    GNO                     sd                                              sdw
1:   1 SD1, SD44, SD384, SD32 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET
2:   2     SD12, SD315, SD387       SD12:PAINTING, SD315:MONEY31, SD387:SPRING
3:   3       SD23, SD1, SD567                SD23:SWITCH, SD1:LUSH, SD567:TREK
4:   4        SD32, SD1, SD40   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK
5:   5    SD345, SD340, SD387               SD345:HOLE, SD340:DUST, SD387:ROLL
6:   6     SD42, SD345, SD183               SD42:CRAYON, SD345:FOX, SD183:WIRE
7:   7      SD455, SD86, SD39 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC

I am trying to split each set of elements in column sd and get a modified data.table according to column GNO as follows.

G2 <- G[, list(sd = unlist(strsplit( sd , "," ))) , by = list(GNO, sdw)]

G2
   GNO                                              sdw     sd
 1:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET    SD1
 2:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD44
 3:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET  SD384
 4:   1 SD1:LUSH, SD44:CANCEL, SD384:FR563, SD32:TRUMPET   SD32
 5:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING   SD12
 6:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD315
 7:   2       SD12:PAINTING, SD315:MONEY31, SD387:SPRING  SD387
 8:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK   SD23
 9:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK    SD1
10:   3                SD23:SWITCH, SD1:LUSH, SD567:TREK  SD567
11:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD32
12:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK    SD1
13:   4   SD32:TRUMPET, SD1:FIELD, SD40:GREEN, SD40:PARK   SD40
14:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD345
15:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD340
16:   5               SD345:HOLE, SD340:DUST, SD387:ROLL  SD387
17:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE   SD42
18:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD345
19:   6               SD42:CRAYON, SD345:FOX, SD183:WIRE  SD183
20:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC  SD455
21:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD86
22:   7 SD455:TOMATO, SD86:RAY, SD39:MATURE, SD86:COSMIC   SD39

I would also like to split the elements in sdw. But when I am trying that, it all gets messed up due to the fact that number of elements in sd and sdw are not necessarily the same.

t <- G[ , list(sd = unlist(strsplit(sd, "," )),
               sdw = unlist(strsplit(sdw, "," ))) , by = list(GNO) ]

It give the warning

Warning message:
In `[.data.table`(G, , list(sd = unlist(strsplit(sd, ",")), sdw = unlist(strsplit(sdw,  :
  Column 1 of result for group 4 is length 3 but the longest column in this result is 4. Recycled leaving remainder of 1 items. This warning is once only for the first group with this issue.

The output I am looking for is as follows.

out <- structure(list(GNO = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 
5, 5, 5, 6, 6, 6, 7, 7, 7), sd = structure(c(1L, 14L, 9L, 6L, 
2L, 5L, 10L, 4L, 1L, 16L, 6L, 1L, 12L, 8L, 7L, 10L, 13L, 8L, 
3L, 15L, 17L, 11L), .Label = c("SD1", "SD12", "SD183", "SD23", 
"SD315", "SD32", "SD340", "SD345", "SD384", "SD387", "SD39", 
"SD40", "SD42", "SD44", "SD455", "SD567", "SD86"), class = "factor"), 
    sdw = structure(c(2L, 17L, 11L, 7L, 3L, 6L, 13L, 5L, 2L, 
    19L, 7L, 1L, 15L, 10L, 8L, 12L, 16L, 9L, 4L, 18L, 20L, 14L
    ), .Label = c("SD1:FIELD", "SD1:LUSH", "SD12:PAINTING", "SD183:WIRE", 
    "SD23:SWITCH", "SD315:MONEY31", "SD32:TRUMPET", "SD340:DUST", 
    "SD345:FOX", "SD345:HOLE", "SD384:FR563", "SD387:ROLL", "SD387:SPRING", 
    "SD39:MATURE", "SD40:GREEN, SD40:PARK", "SD42:CRAYON", "SD44:CANCEL", 
    "SD455:TOMATO", "SD567:TREK", "SD86:COSMIC, SD86:RAY"), class = "factor")), .Names = c("GNO", 
"sd", "sdw"), row.names = c(NA, -22L), class = "data.frame")

out
   GNO    sd                   sdw
1    1   SD1              SD1:LUSH
2    1  SD44           SD44:CANCEL
3    1 SD384           SD384:FR563
4    1  SD32          SD32:TRUMPET
5    2  SD12         SD12:PAINTING
6    2 SD315         SD315:MONEY31
7    2 SD387          SD387:SPRING
8    3  SD23           SD23:SWITCH
9    3   SD1              SD1:LUSH
10   3 SD567            SD567:TREK
11   4  SD32          SD32:TRUMPET
12   4   SD1             SD1:FIELD
13   4  SD40 SD40:GREEN, SD40:PARK
14   5 SD345            SD345:HOLE
15   5 SD340            SD340:DUST
16   5 SD387            SD387:ROLL
17   6  SD42           SD42:CRAYON
18   6 SD345             SD345:FOX
19   6 SD183            SD183:WIRE
20   7 SD455          SD455:TOMATO
21   7  SD86 SD86:COSMIC, SD86:RAY
22   7  SD39           SD39:MATURE

How to get this output?

Upvotes: 3

Views: 237

Answers (2)

A5C1D2H2I1M1N2O1R2T1
A5C1D2H2I1M1N2O1R2T1

Reputation: 193507

This may be making too big of an assumption about the nature of your data, but it appears that the "sd" column is actually somewhat irrelevant since that information is already embedded in the "sdw" column.

As such, this is what I came up with, using two cSplits from my "splitstackshape" package:

library(splitstackshape)
temp <- cSplit(
  cSplit(
    G, "sdw", ",", "long"), 
  "sdw", ":")[, c(1, 3, 4)
              ][, list(paste(unique(sdw_1), unique(sdw_2), 
                             sep = ":", collapse = ", ")), 
                by = list(GNO, sdw_1)]
setnames(temp, c("GNO", "sd", "sdw"))
temp
#     GNO    sd                   sdw
#  1:   1   SD1              SD1:LUSH
#  2:   1  SD44           SD44:CANCEL
#  3:   1 SD384           SD384:FR563
#  4:   1  SD32          SD32:TRUMPET
#  5:   2  SD12         SD12:PAINTING
#  6:   2 SD315         SD315:MONEY31
#  7:   2 SD387          SD387:SPRING
#  8:   3  SD23           SD23:SWITCH
#  9:   3   SD1              SD1:LUSH
# 10:   3 SD567            SD567:TREK
# 11:   4  SD32          SD32:TRUMPET
# 12:   4   SD1             SD1:FIELD
# 13:   4  SD40 SD40:GREEN, SD40:PARK
# 14:   5 SD345            SD345:HOLE
# 15:   5 SD340            SD340:DUST
# 16:   5 SD387            SD387:ROLL
# 17:   6  SD42           SD42:CRAYON
# 18:   6 SD345             SD345:FOX
# 19:   6 SD183            SD183:WIRE
# 20:   7 SD455          SD455:TOMATO
# 21:   7  SD86 SD86:RAY, SD86:COSMIC
# 22:   7  SD39           SD39:MATURE
#     GNO    sd                   sdw

Comparison with your desired output

library(compare)
compare(out, temp, allowAll = TRUE)
# TRUE 
#   [GNO] coerced from <integer> to <numeric>
#   [sdw] coerced from <character> to <factor>
#   dropped attributes
#   [3] dropped attributes

Upvotes: 2

David Arenburg
David Arenburg

Reputation: 92282

Here's how I would approach this using only data.table package (no need in creating G2 neither)

Here we are first splitting by swd and then aggregating by both GNO and the part before : in the stw column

G[, .(unlist(strsplit(sdw, ", ", TRUE))), GNO][, 
    .(stw = toString(V1)), .(GNO, sd = gsub(":.*", "", V1))]

 #    GNO    sd                   stw
 # 1:   1   SD1              SD1:LUSH
 # 2:   1  SD44           SD44:CANCEL
 # 3:   1 SD384           SD384:FR563
 # 4:   1  SD32          SD32:TRUMPET
 # 5:   2  SD12         SD12:PAINTING
 # 6:   2 SD315         SD315:MONEY31
 # 7:   2 SD387          SD387:SPRING
 # 8:   3  SD23           SD23:SWITCH
 # 9:   3   SD1              SD1:LUSH
# 10:   3 SD567            SD567:TREK
# 11:   4  SD32          SD32:TRUMPET
# 12:   4   SD1             SD1:FIELD
# 13:   4  SD40 SD40:GREEN, SD40:PARK
# 14:   5 SD345            SD345:HOLE
# 15:   5 SD340            SD340:DUST
# 16:   5 SD387            SD387:ROLL
# 17:   6  SD42           SD42:CRAYON
# 18:   6 SD345             SD345:FOX
# 19:   6 SD183            SD183:WIRE
# 20:   7 SD455          SD455:TOMATO
# 21:   7  SD86 SD86:RAY, SD86:COSMIC
# 22:   7  SD39           SD39:MATURE

Upvotes: 6

Related Questions