user3664020
user3664020

Reputation: 3020

Reshape data table

I have a data table like (data is not necessarily ordered by 'col1')

    col0    col1      col2
1:  abc       1         a
2:  abc       2         b 
3:  abc       3         c 
4:  abc       4         d 
5:  abc       5         e
6:  def       1         a
7:  def       2         b 
8:  def       3         c 
9:  def       4         d 
10: def       5         e

I want to reshape it the following way

    col0      col1      col2      new_1   new_2   new_3   new_4
1:  abc         1         a         NA      NA       NA      NA
2:  abc         2         b         a       NA       NA      NA
3:  abc         3         c         b       a        NA      NA
4:  abc         4         d         c       b        a       NA 
5:  abc         5         e         d       c        b       a
6:  def         1         a         NA      NA       NA      NA
7:  def         2         b         a       NA       NA      NA
8:  def         3         c         b       a        NA      NA
9:  def         4         d         c       b        a       NA 
10: def         5         e         d       c        b       a

Basically I want to get previously occurred values of col2 for each row in the same row as above and if there is none the corresponding new column should say NA.

I can of course do it by merge on col2 5 times but I need to do this on a large table (in that case I will have to merge 20-30 times).

What is the best way to achieve it in R in 1 or 2 lines?

Upvotes: 1

Views: 105

Answers (2)

Alan Gómez
Alan Gómez

Reputation: 378

Data

df <- data.frame(col0 = rep(c("abc", "def"), each=5), col1 = rep(1:5, 2), col2 = rep(letters[1:5],2))

Procedure

m <- matrix(ncol=max(df[,2]), nrow=dim(df)[1])
#defining a matrix of final shaped dimensions 

for (i in 1:10) {
    m[i,1:df[i,2]] <- (c(df[i:(i-df[i,2]+1),3]))
}
#replacements

df2 <- cbind(df, m)
#final combination

colnames(df2)[4:ncol(df2)] <- paste("new_", 1:ncol(m))
#column's renaming

OUTPUTS

> df
   col0 col1 col2
1   abc    1    a
2   abc    2    b
3   abc    3    c
4   abc    4    d
5   abc    5    e
6   def    1    a
7   def    2    b
8   def    3    c
9   def    4    d
10  def    5    e
> df2
   col0 col1 col2 new_ 1 new_ 2 new_ 3 new_ 4 new_ 5
1   abc    1    a      a   <NA>   <NA>   <NA>   <NA>
2   abc    2    b      b      a   <NA>   <NA>   <NA>
3   abc    3    c      c      b      a   <NA>   <NA>
4   abc    4    d      d      c      b      a   <NA>
5   abc    5    e      e      d      c      b      a
6   def    1    a      a   <NA>   <NA>   <NA>   <NA>
7   def    2    b      b      a   <NA>   <NA>   <NA>
8   def    3    c      c      b      a   <NA>   <NA>
9   def    4    d      d      c      b      a   <NA>
10  def    5    e      e      d      c      b      a

Upvotes: 0

akrun
akrun

Reputation: 887118

We can use shift from the devel version of data.table i.e. v1.9.5 (Instructions to install the devel version are here. By default, the type in shift is lag. We can specify n as a vector, in this case 1:4. We assign (:=) the output to new columns.

library(data.table)#v1.9.5+
DT[, paste('new', 1:4, sep="_") := shift(col2, 1:4)]
DT
#   col1 col2 new_1 new_2 new_3 new_4
#1:    1    a    NA    NA    NA    NA
#2:    2    b     a    NA    NA    NA
#3:    3    c     b     a    NA    NA
#4:    4    d     c     b     a    NA
#5:    5    e     d     c     b     a

For the new dataset 'DT2', we need to group by 'col0' and then do the shift on 'col2'

DT2[, paste('new', 1:4, sep="_") := shift(col2, 1:4), by = col0]
DT2
#   col0 col1 col2 new_1 new_2 new_3 new_4
# 1:  abc    1    a    NA    NA    NA    NA
# 2:  abc    2    b     a    NA    NA    NA
# 3:  abc    3    c     b     a    NA    NA
# 4:  abc    4    d     c     b     a    NA
# 5:  abc    5    e     d     c     b     a
# 6:  def    1    a    NA    NA    NA    NA
# 7:  def    2    b     a    NA    NA    NA
# 8:  def    3    c     b     a    NA    NA
# 9:  def    4    d     c     b     a    NA
#10:  def    5    e     d     c     b     a

data

df1 <- structure(list(col1 = 1:5, col2 = c("a", "b", "c", "d", "e"), 
new_1 = c(NA, "a", "b", "c", "d"), new_2 = c(NA, NA, "a", 
"b", "c"), new_3 = c(NA, NA, NA, "a", "b"), new_4 = c(NA, 
NA, NA, NA, "a")), .Names = c("col1", "col2", "new_1", "new_2", 
"new_3", "new_4"), class = "data.frame", row.names = c(NA, -5L
))

DT <- as.data.table(df1)

df2 <- structure(list(col0 = c("abc", "abc", "abc", "abc", "abc", 
"def", 
"def", "def", "def", "def"), col1 = c(1L, 2L, 3L, 4L, 5L, 1L, 
2L, 3L, 4L, 5L), col2 = c("a", "b", "c", "d", "e", "a", "b", 
 "c", "d", "e")), .Names = c("col0", "col1", "col2"), 
class = "data.frame", row.names = c(NA, -10L))
DT2 <- as.data.table(df2)

Upvotes: 3

Related Questions