user1658170
user1658170

Reputation: 858

How to split the columns of a data frame and reshape it?

I am trying to restructure data so that a column containing multiple values is spread to match the desired output I have listed below? I made several attempts with tidyr::spread() and tidyr::gather() to no avail. Any ideas?

dat <- data.frame("name" = c("a", "b", "c"), 
                  "count" = c("2003=22; 2004=32", 
                              "2003=34; 2005=45", 
                              "2005=32; 2006=67"))


name            count
a               2003=22; 2004=32
b               2003=34; 2005=45
c               2005=32; 2006=67

Desired output:

name    2003    2004    2005    2006    
a        22     32      NA      NA
b        34     NA      45      NA      
c        NA     NA      32      67

Upvotes: 1

Views: 320

Answers (5)

moodymudskipper
moodymudskipper

Reputation: 47350

You could use separate_rows and separate from tidyr.

library(tidyr)
dat %>% 
  separate_rows(count, sep = "; ") %>%
  separate(count, sep = "=", into = c("key","val")) %>% 
  spread(key, val)
#   name 2003 2004 2005 2006
# 1    a   22   32 <NA> <NA>
# 2    b   34 <NA>   45 <NA>
# 3    c <NA> <NA>   32   67

Another solution would have been parse these strings into a tibble and unnest the result :

library(tidyverse)
dat %>%
  mutate(count = map(count, ~  gsub("; ",", Y",.) %>%
                       paste0("tibble(Y",.,")") %>%
                       {eval(parse(text= .))})) %>%
  unnest %>%
  rename_at(-1,~str_sub(.,2))
#   name 2003 2004 2005 2006
# 1    a   22   32   NA   NA
# 2    b   34   NA   45   NA
# 3    c   NA   NA   32   67

Upvotes: 2

jay.sf
jay.sf

Reputation: 73712

If you want a base R approach without packages based on strsplit() and reshape().

dat[] <- lapply(dat, as.character)  # transform columns to characters

# split the columns
DF <- data.frame(cbind(rep(dat$name, each=2), 
                       matrix(unlist(
                         lapply(strsplit(dat$count, "; "), strsplit, "=")), 
                         6, byrow=TRUE)))

# reshape into wide format
DF <- reshape(DF, timevar="X2", idvar="X1", direction="wide")

# coerce year values into numeric form
DF[, -1] <- lapply(DF[, -1], function(x) as.numeric(as.character(x)))

# desired column names (optional)
names(DF) <- c(names(dat)[1], sub("X[^09+].", "", names(DF)[-1]))

> DF
  name 2003 2004 2005 2006
1    a   22   32   NA   NA
3    b   34   NA   45   NA
5    c   NA   NA   32   67

Upvotes: 0

Shree
Shree

Reputation: 11150

Here is a way using extract + bind_rows + spread -

dat %>%
  extract(count, c("year1", "value1", "year2", "value2"),
          regex = "([:digit:]+)=([:digit:]+);.([:digit:]+)=([:digit:]+)") %>% 
  {bind_rows(
    select(., name, year = year1, value = value1),
    select(., name, year = year2, value = value2)
  )} %>% 
  spread(year, value)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA>   32   67

Here's another way using extract + spread + spread. This may look less verbose but I feel above method is more reliable as the first spread here could potentially fail for some specific cases.

dat %>%
  extract(count, c("year1", "value1", "year2", "value2"),
          regex = "([:digit:]+)=([:digit:]+);.([:digit:]+)=([:digit:]+)") %>% 
  spread(year1, value1) %>%
  spread(year2, value2)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA> <NA>   67

Upvotes: 0

neilfws
neilfws

Reputation: 33802

There's probably a smarter, more concise way, but this works:

library(tidyr)
dat %>% 
  separate(count, sep = "; ", into = c("c1", "c2")) %>% 
  gather(Var, Val, -name) %>% 
  separate(Val, sep = "=", into = c("year", "value")) %>% 
  select(-Var) %>% 
  spread(year, value)

  name 2003 2004 2005 2006
1    a   22   32 <NA> <NA>
2    b   34 <NA>   45 <NA>
3    c <NA> <NA>   32   67

Note that this results in "wide" data; the "long" data before the spread might be easier to work with.

Upvotes: 3

AidanGawronski
AidanGawronski

Reputation: 2085

Here is a solution with base R and reshape2:

ting1 <- data.frame(get1 = gsub("(.*);(.*)", "\\1", dat$count))
ting1 <- cbind(name = dat$name, ting1)

ting2 <- data.frame(get1 = gsub("(.*);(.*)", "\\2", dat$count))
ting2 <- cbind(name = dat$name, ting2)

df <- rbind(ting1, ting2)

df$years <- trimws(gsub("(.*)=(.*)", "\\1", df$get1))
df$values <- gsub("(.*)=(.*)", "\\2", df$get1)

library(reshape2)

outdf <- dcast(df, name ~ years, value.var = "values")
outdf
# name 2003 2004 2005 2006
# 1    a   22   32 <NA> <NA>
# 2    b   34 <NA>   45 <NA>
# 3    c <NA> <NA>   32   67

Upvotes: 1

Related Questions