Reputation: 814
I have some data that looks like:
ID Data
1 a b c
2 a b c a b c
3 a b c
4 a b c a b c a b c a b c
5 a b c a b c a b c
I would like it in the following
ID Data
1 a b c
2 a b c
2 a b c
3 a b c
4 a b c
4 a b c
4 a b c
4 a b c
5 a b c
5 a b c
5 a b c
a
, b
and c
are in different column cells so Data
is actually many columns. I can concatenate them if need be.
Essentially a
b
and c
are related but for some IDs I have multiple results and I'd like the data in a long format rather than wide but keeping the ID for each row.
I can do in this is R if that's easier too.
Dput:
structure(list(ID = c(9999812L, 999908L, 9993595L, 9992905L,
9989664L, 9984487L, 9980956L, 9980112L, 9980091L, 9979915L, 9979613L,
9979400L, 9978215L, 9976882L, 9975335L, 9974511L, 9973804L, 9973025L
), a = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "a", class = "factor"),
b = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "b", class = "factor"),
c = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "c", class = "factor"),
a.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "a"), class = "factor"),
b.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "b"), class = "factor"),
c.1 = structure(c(2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L, 2L), .Label = c("", "c"), class = "factor"),
a.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.2 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"),
a.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.3 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor"),
a.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "a"), class = "factor"),
b.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "b"), class = "factor"),
c.4 = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("", "c"), class = "factor")), .Names = c("ID",
"a", "b", "c", "a.1", "b.1", "c.1", "a.2", "b.2", "c.2", "a.3",
"b.3", "c.3", "a.4", "b.4", "c.4"), class = "data.frame", row.names = c(NA,
-18L))
Upvotes: 0
Views: 152
Reputation: 11128
With your initial data, you can use "stringr" and "reshape2" to melt the data.
dt <- data.frame(x = 1:5, y = c( "a b c" , "a b c a b c","a b c","a b c a b c a b c a b c", "a b c a b c a b c"))
library("stringr")
library("reshape2")
maxlen <- max(lengths(str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")))
list_lists <- str_extract_all(dt$y,"(\\w)\\s(\\w)\\s(\\w)(\\1\\s\\2\\s\\3)*")
li <- lapply(list_lists, `length<-`,maxlen)
dtnew <- data.frame(x =cbind(dt$x),do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),]
> dtnew1[order(dtnew1$x),c(1,3)]
x value
1 1 a b c
2 2 a b c
7 2 a b c
3 3 a b c
4 4 a b c
9 4 a b c
14 4 a b c
19 4 a b c
5 5 a b c
10 5 a b c
15 5 a b c
>
EDIT: For the updated data, make a field called "concat
", which is concatenated value of columns "a" to "c.4"
You can use : concat <- data.frame(concat=do.call("paste0",dt[,2:length(dt)]))
to concatenate fields
then assign dt$concat <- concat
library("stringr")
library("reshape2")
maxlen <- max(lengths(str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")))
list_lists <- str_extract_all(dt$concat,"(\\w)(\\w)(\\w)")
li <- lapply(list_lists, `length<-`,maxlen)
dtnew <- data.frame(x =cbind(dt$ID),y=do.call("rbind",li))
dtnew1 <- melt(dtnew,id.vars="x")
dtnew1 <- dtnew1[!is.na(dtnew1$value),]
dtnew1[order(dtnew1$x),c(1,3)]
> dtnew1[order(dtnew1$x),c(1,3)]
x value
2 999908 abc
18 9973025 abc
36 9973025 abc
54 9973025 abc
72 9973025 abc
90 9973025 abc
17 9973804 abc
35 9973804 abc
16 9974511 abc
15 9975335 abc
33 9975335 abc
51 9975335 abc
69 9975335 abc
14 9976882 abc
13 9978215 abc
12 9979400 abc
30 9979400 abc
48 9979400 abc
11 9979613 abc
10 9979915 abc
9 9980091 abc
8 9980112 abc
7 9980956 abc
6 9984487 abc
24 9984487 abc
5 9989664 abc
4 9992905 abc
3 9993595 abc
1 9999812 abc
19 9999812 abc
>
Upvotes: 1