Jeni
Jeni

Reputation: 968

Use comma separated character strings to plot a set of values for each row

I have this dataframe:

sample   start   type   source
  A      2,3,4    D2      BS
  B      5,6      D2      BS
  C      7,2,1    D3      AT
  D      8        D2      BS
  E      1,2,8,1  D3      BS
  F      4,3,3    D3      AT
  G      1,1      D2      BS
  H      2,9      D2      AT

I would like to draw two boxplots of the start values, one for each value of type. The problem is that start is a character column, so I don't know how to treat each comma-separated element of the string as a separate number. How can I achieve this?

Upvotes: 1

Views: 375

Answers (3)

akrun
akrun

Reputation: 887711

We can use separate_rows from tidyr

library(dplyr)
library(tidyr)
# Split the comma-separated `start` strings into one row per value.
# `sep = ","` splits on commas only (the default pattern would also split on
# a minus sign), and `convert = TRUE` parses the pieces as integers so that
# `start` can be used directly as the numeric axis of a boxplot.
df %>%
  separate_rows(start, sep = ",", convert = TRUE)
# A tibble: 20 x 4
#   sample start type  source
#   <chr>  <int> <chr> <chr>
# 1 A          2 D2    BS
# 2 A          3 D2    BS
# 3 A          4 D2    BS
# 4 B          5 D2    BS
# 5 B          6 D2    BS
# 6 C          7 D3    AT
# 7 C          2 D3    AT
# 8 C          1 D3    AT
# 9 D          8 D2    BS
#10 E          1 D3    BS
#11 E          2 D3    BS
#12 E          8 D3    BS
#13 E          1 D3    BS
#14 F          4 D3    AT
#15 F          3 D3    AT
#16 F          3 D3    AT
#17 G          1 D2    BS
#18 G          1 D2    BS
#19 H          2 D2    AT
#20 H          9 D2    AT

data

# Reproducible copy of the example data. Every column is character; `start`
# holds the comma-separated values that still need to be split.
df <- data.frame(
  sample = c("A", "B", "C", "D", "E", "F", "G", "H"),
  start = c("2,3,4", "5,6", "7,2,1", "8", "1,2,8,1", "4,3,3", "1,1", "2,9"),
  type = c("D2", "D2", "D3", "D2", "D3", "D3", "D2", "D2"),
  source = c("BS", "BS", "AT", "BS", "BS", "AT", "BS", "AT"),
  stringsAsFactors = FALSE
)

Upvotes: 1

Eyayaw
Eyayaw

Reputation: 1081

library(dplyr)
library(tidyr)
library(ggplot2)

# Example data. `start` is read as character because its entries contain
# commas; `stringsAsFactors = FALSE` keeps that true on R < 4.0 as well.
df <- read.table(text = "sample   start   type   source
  A      2,3,4    D2      BS
  B      5,6      D2      BS
  C      7,2,1    D3      AT
  D      8        D2      BS
  E      1,2,8,1  D3      BS
  F      4,3,3    D3      AT
  G      1,1      D2      BS
  H      2,9      D2      AT", header = TRUE, stringsAsFactors = FALSE)

# Size the helper columns from the data instead of hard-coding three of them,
# so rows with more values (sample E has four) are not silently truncated.
n_parts <- max(lengths(strsplit(df$start, ",", fixed = TRUE)))

df <- df %>%
  separate(
    col = start,
    into = paste0("var", seq_len(n_parts)),
    sep = ",",
    fill = "right",  # pad shorter rows with NA on the right, without a warning
    convert = TRUE   # parse the pieces as integers rather than strings
  ) %>%
  # Stack the helper columns back into a single numeric `start` column and
  # drop the NA padding introduced by `fill = "right"`.
  pivot_longer(
    cols = matches("^var\\d+$"),
    values_to = "start",
    values_drop_na = TRUE
  )


df

# A tibble: 20 x 5
#    sample type  source name  start
#    <chr>  <chr> <chr>  <chr> <int>
#  1 A      D2    BS     var1      2
#  2 A      D2    BS     var2      3
#  3 A      D2    BS     var3      4
#  4 B      D2    BS     var1      5
#  5 B      D2    BS     var2      6
#  6 C      D3    AT     var1      7
#  7 C      D3    AT     var2      2
#  8 C      D3    AT     var3      1
#  9 D      D2    BS     var1      8
# 10 E      D3    BS     var1      1
# 11 E      D3    BS     var2      2
# 12 E      D3    BS     var3      8
# 13 E      D3    BS     var4      1
# 14 F      D3    AT     var1      4
# 15 F      D3    AT     var2      3
# 16 F      D3    AT     var3      3
# 17 G      D2    BS     var1      1
# 18 G      D2    BS     var2      1
# 19 H      D2    AT     var1      2
# 20 H      D2    AT     var2      9
 
# One box per `type`, with the outline coloured by `type`.
ggplot(data = df, mapping = aes(x = type, y = start, colour = type)) +
  geom_boxplot()

Upvotes: 1

starja
starja

Reputation: 10375

You can use apply to map a function over every row of your data.frame. Inside that function, str_split splits the start entry into a vector of values, and data.frame builds one row per value, repeating the other columns. apply returns a list of data.frames, so do.call("rbind", df_new) combines them into a single data.frame for plotting. Note that repeated values are kept: the 1,1 entry contributes the value 1 twice:

# Example data parsed from inline text. The first line supplies the column
# names and all columns are kept as character vectors (no factors).
df <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "sample   start   type   source
  A      2,3,4    D2      BS
  B      5,6      D2      BS
  C      7,2,1    D3      AT
  D      8        D2      BS
  E      1,2,8,1  D3      BS
  F      4,3,3    D3      AT
  G      1,1      D2      BS
  H      2,9      D2      AT"
)


library(stringr)

# Expand every row of `df` into a small data.frame holding one record per
# comma-separated `start` value; the other columns are repeated.
df_new <- apply(df, 1, function(row) {
  values <- as.numeric(str_split(row["start"], ",", simplify = TRUE))
  expanded <- data.frame(
    sample = row["sample"],
    start = values,
    type = row["type"],
    source = row["source"]
  )
  # The inputs are named vectors; reset the row names so they do not leak
  # into the result.
  rownames(expanded) <- NULL
  expanded
})

# Stack the per-row data.frames into a single data.frame.
df_new <- do.call("rbind", df_new)

df_new
   sample start type source
1       A     2   D2     BS
2       A     3   D2     BS
3       A     4   D2     BS
4       B     5   D2     BS
5       B     6   D2     BS
6       C     7   D3     AT
7       C     2   D3     AT
8       C     1   D3     AT
9       D     8   D2     BS
10      E     1   D3     BS
11      E     2   D3     BS
12      E     8   D3     BS
13      E     1   D3     BS
14      F     4   D3     AT
15      F     3   D3     AT
16      F     3   D3     AT
17      G     1   D2     BS
18      G     1   D2     BS
19      H     2   D2     AT
20      H     9   D2     AT

library(ggplot2)
# One box per `type`, filled by `type`.
ggplot(data = df_new, mapping = aes(x = type, y = start, fill = type)) +
  geom_boxplot()

Upvotes: 2

Related Questions