Ahmad Noman Alnoor
Ahmad Noman Alnoor

Reputation: 107

gathering the iris data set. in R

I want to modify the iris data set in R. It has 5 variables: species, sepal.length, sepal.width, petal.length and petal.width. I need a new column called part which specifies if it's the sepal or the petal and length and width columns which show the measurements. the result should look like this:

image

I also need make another one where there is a column called measure which indicates the type of measurement (length or width) and shows with the species as variables. It should look like this:

How do I do this using tidyr?

Upvotes: 1

Views: 1124

Answers (4)

olivroy
olivroy

Reputation: 829

library(tidyverse)
# Pivotting iris to long data
long_iris <- iris |> 
  mutate(id = row_number()) |> 
  pivot_longer(
    cols = !c(id, Species), 
    names_to = c("Part", "Measure"),
    names_sep = "\\." # Iris variable separator regex
  )
long_iris
#> # A tibble: 600 x 5
#>    Species    id Part  Measure value
#>    <fct>   <int> <chr> <chr>   <dbl>
#>  1 setosa      1 Sepal Length    5.1
#>  2 setosa      1 Sepal Width     3.5
#>  3 setosa      1 Petal Length    1.4
#>  4 setosa      1 Petal Width     0.2
#>  5 setosa      2 Sepal Length    4.9
#>  6 setosa      2 Sepal Width     3  
#>  7 setosa      2 Petal Length    1.4
#>  8 setosa      2 Petal Width     0.2
#>  9 setosa      3 Sepal Length    4.7
#> 10 setosa      3 Sepal Width     3.2
#> # ... with 590 more rows

# Using the long data, we can repivot to a long format

iris_length_width <- long_iris |> 
  pivot_wider(
    id_cols = c(id, Species, Part),
    names_from = Measure,
    values_from = value
  )
# This achieves the same thing
iris |> 
  mutate(id = row_number()) |> 
  pivot_longer(
    cols = !c(id, Species), 
    names_to = c("Part", ".value"),
    names_sep = "\\." # Iris variable separator regex
  )
#> # A tibble: 300 x 5
#>    Species    id Part  Length Width
#>    <fct>   <int> <chr>  <dbl> <dbl>
#>  1 setosa      1 Sepal    5.1   3.5
#>  2 setosa      1 Petal    1.4   0.2
#>  3 setosa      2 Sepal    4.9   3  
#>  4 setosa      2 Petal    1.4   0.2
#>  5 setosa      3 Sepal    4.7   3.2
#>  6 setosa      3 Petal    1.3   0.2
#>  7 setosa      4 Sepal    4.6   3.1
#>  8 setosa      4 Petal    1.5   0.2
#>  9 setosa      5 Sepal    5     3.6
#> 10 setosa      5 Petal    1.4   0.2
#> # ... with 290 more rows

2nd part

long_iris
#> # A tibble: 600 x 5
#>    Species    id Part  Measure value
#>    <fct>   <int> <chr> <chr>   <dbl>
#>  1 setosa      1 Sepal Length    5.1
#>  2 setosa      1 Sepal Width     3.5
#>  3 setosa      1 Petal Length    1.4
#>  4 setosa      1 Petal Width     0.2
#>  5 setosa      2 Sepal Length    4.9
#>  6 setosa      2 Sepal Width     3  
#>  7 setosa      2 Petal Length    1.4
#>  8 setosa      2 Petal Width     0.2
#>  9 setosa      3 Sepal Length    4.7
#> 10 setosa      3 Sepal Width     3.2
#> # ... with 590 more rows

# creating a row id by group to create the second output
long_iris |> 
  group_by(Species) |> 
  mutate(id = row_number()) |> 
  ungroup() |> 
  pivot_wider(
    id_cols = c(id, Part, Measure),
    names_from = Species,
    values_from = value
  ) |> 
  arrange(Measure, Part) |> 
  select(-id)
#> # A tibble: 200 x 5
#>    Part  Measure setosa versicolor virginica
#>    <chr> <chr>    <dbl>      <dbl>     <dbl>
#>  1 Petal Length     1.4        4.7       6  
#>  2 Petal Length     1.4        4.5       5.1
#>  3 Petal Length     1.3        4.9       5.9
#>  4 Petal Length     1.5        4         5.6
#>  5 Petal Length     1.4        4.6       5.8
#>  6 Petal Length     1.7        4.5       6.6
#>  7 Petal Length     1.4        4.7       4.5
#>  8 Petal Length     1.5        3.3       6.3
#>  9 Petal Length     1.4        4.6       5.8
#> 10 Petal Length     1.5        3.9       6.1
#> # ... with 190 more rows

Created on 2022-06-15 by the reprex package (v2.0.1)

Upvotes: 0

Len Greski
Len Greski

Reputation: 10855

Here is an approach where we make a narrow format tidy data set to start, and then use pivot_wider() to get the result with Length and Width columns.

library(tidyr)
# add an ID variable so we can pivot_wider and match measurement for correct observations
id <- 1:nrow(iris)
data <- cbind(id,iris)
data %>%  gather(.,key = "part.measurement",value = "value",-id,-Species) %>%
     separate(.,part.measurement,c("part","measurement")) -> narrow_data
 head(narrow_data[2:5])

> head(narrow_data[2:5])
  Species  part measurement value
1  setosa Sepal      Length   5.1
2  setosa Sepal      Length   4.9
3  setosa Sepal      Length   4.7
4  setosa Sepal      Length   4.6
5  setosa Sepal      Length   5.0
6  setosa Sepal      Length   5.4

At this point we can use pivot_wider() to create the Length and Width columns. We'll add an arrange() so the sort order matches the image posted with the question.

narrow_data %>% pivot_wider(.,names_from = measurement,values_from = value) %>%
     arrange(Species,part)-> wide_data
head(wide_data[2:5]) 

...and the output:

> head(wide_data[2:5])
# A tibble: 6 x 4
  Species part  Length Width
  <fct>   <chr>  <dbl> <dbl>
1 setosa  Petal    1.4   0.2
2 setosa  Petal    1.4   0.2
3 setosa  Petal    1.3   0.2
4 setosa  Petal    1.5   0.2
5 setosa  Petal    1.4   0.2
6 setosa  Petal    1.7   0.4
> 

The second output is tricky because it essentially merges the 200 observations of part and measurement for each species of flower into an output tibble of 200 rows, one for each combination of part and length for each of the 50 observations of each Species.

#  reproduce 2nd output
speciesId <- c(1:200,1:200,1:200) # unique obs within species
narrow_species_data <- cbind(speciesId,narrow_data[order(narrow_data[,1],narrow_data[,3],narrow_data[,4]),c(2:5)])

narrow_species_data %>% pivot_wider(.,names_from= Species,values_from = value) %>%
     arrange(part,measurement,speciesId) -> wide_data_species
head(wide_data_species[2:6])

...and the output:

>  head(wide_data_species[2:6])
# A tibble: 6 x 5
  part  measurement setosa versicolor virginica
  <chr> <chr>        <dbl>      <dbl>     <dbl>
1 Petal Length         1.4        4.7       6  
2 Petal Length         1.4        4.5       5.1
3 Petal Length         1.3        4.9       5.9
4 Petal Length         1.5        4         5.6
5 Petal Length         1.4        4.6       5.8
6 Petal Length         1.7        4.5       6.6
>

A "completely tidyverse" version

Here is a version of both parts of the question that solely uses features from the tidyverse family of packages.

For the first question, we use mutate() and seq_along() to create unique sequential numbers to identify each observation in the original data. We create a narrow form tidy data set with gather(), and then convert it into the desired output with pivot_wider(). To match the order of observations from the image in the original question, we arrange(Species,part).

library(tidyr)
library(dplyr)
# add an ID variable so we can pivot_wider and match measurement for correct observations
iris %>% mutate(id = seq_along(Species)) %>%  gather(.,key = "part.measurement",value = "value",-id,-Species) %>%
     separate(.,part.measurement,c("part","measurement")) -> narrow_data

narrow_data %>% pivot_wider(.,names_from = measurement,values_from = value) %>% 
     arrange(Species,part) -> wide_data
head(wide_data[2:5])

...and the output:

> head(wide_data[2:5])
# A tibble: 6 x 4
     id part  Length Width
  <int> <chr>  <dbl> <dbl>
1     1 Petal    1.4   0.2
2     2 Petal    1.4   0.2
3     3 Petal    1.3   0.2
4     4 Petal    1.5   0.2
5     5 Petal    1.4   0.2
6     6 Petal    1.7   0.4
> 

For the second question, instead of building a vector of sequential IDs for each species and using cbind() it to the rest of the data, we can use dplyr functions to create the sequences within a pipeline.

We use arrange() to sort the data by Species, id, part, and measurement. Then we group_by(Species) so we can use mutate() to create a unique sequential ID with seq_along(). The sort order is important because we want to merge the 1st observation with the 51st observation and the 101st observation.

Then we ungroup() to clear the group_by() and use pivot_wider() with id_cols = speciesId to create the desired output.

narrow_data %>% arrange(Species,id,part,measurement) %>% group_by(Species) %>% mutate(speciesId = seq_along(Species)) %>%
   ungroup(.) %>% pivot_wider(.,id_cols=c("speciesId","part","measurement"),names_from= Species,values_from = value) %>%
     arrange(part,measurement,speciesId) -> wide_data_species
head(wide_data_species[2:6])

...and the output:

> head(wide_data_species[2:6])
# A tibble: 6 x 5
  part  measurement setosa versicolor virginica
  <chr> <chr>        <dbl>      <dbl>     <dbl>
1 Petal Length         1.4        4.7       6  
2 Petal Length         1.4        4.5       5.1
3 Petal Length         1.3        4.9       5.9
4 Petal Length         1.5        4         5.6
5 Petal Length         1.4        4.6       5.8
6 Petal Length         1.7        4.5       6.6
>

Upvotes: 2

lroha
lroha

Reputation: 34441

This can be done solely with tidyr functions:

First step:

(first <- iris %>%
  pivot_longer(cols = -Species, names_sep = "\\.", names_to = c("Part", ".value")))

# A tibble: 300 x 4
   Species Part  Length Width
   <fct>   <chr>  <dbl> <dbl>
 1 setosa  Sepal    5.1   3.5
 2 setosa  Petal    1.4   0.2
 3 setosa  Sepal    4.9   3  
 4 setosa  Petal    1.4   0.2
 5 setosa  Sepal    4.7   3.2
 6 setosa  Petal    1.3   0.2
 7 setosa  Sepal    4.6   3.1
 8 setosa  Petal    1.5   0.2
 9 setosa  Sepal    5     3.6
10 setosa  Petal    1.4   0.2
# ... with 290 more rows

Second step:

first %>%
   pivot_longer(cols = c(Length, Width), names_to = "Measure") %>%
   pivot_wider(names_from = Species, values_from = value, values_fn = list(value = list)) %>%
   unnest(cols = -c(Part, Measure))

# A tibble: 200 x 5
   Part  Measure setosa versicolor virginica
   <chr> <chr>    <dbl>      <dbl>     <dbl>
 1 Sepal Length     5.1        7         6.3
 2 Sepal Length     4.9        6.4       5.8
 3 Sepal Length     4.7        6.9       7.1
 4 Sepal Length     4.6        5.5       6.3
 5 Sepal Length     5          6.5       6.5
 6 Sepal Length     5.4        5.7       7.6
 7 Sepal Length     4.6        6.3       4.9
 8 Sepal Length     5          4.9       7.3
 9 Sepal Length     4.4        6.6       6.7
10 Sepal Length     4.9        5.2       7.2
# ... with 190 more rows

Upvotes: 2

denisafonin
denisafonin

Reputation: 1136

This is what I can suggest to achieve the first result:

df <- iris

# Changing column order
df <- df %>%
  select(5, 1:4)

Selecting Species, Petal.Length, Sepal.Length columns and gathering:

length <- df %>% 
  select(1,2,4) %>%
  gather("Part", "Length", -1)

length$Part <- gsub(pattern = ".Length", replacement = "", length$Part, )

head(length)
  Species  Part Length
1  setosa Sepal    5.1
2  setosa Sepal    4.9
3  setosa Sepal    4.7
4  setosa Sepal    4.6
5  setosa Sepal    5.0
6  setosa Sepal    5.4

Selecting Species, Petal.Width, Sepal.Width columns and gathering:

width <- df %>% 
  select(1,3,5) %>%
  gather("Part", "Width", -1)

width$Part <- gsub(pattern = ".Width", replacement = "", width$Part, )

head(width)
  Species  Part Width
1  setosa Sepal   3.5
2  setosa Sepal   3.0
3  setosa Sepal   3.2
4  setosa Sepal   3.1
5  setosa Sepal   3.6
6  setosa Sepal   3.9

Combinig the 2 datasets:

merged_df <- length %>%
  mutate(Width = width$Width)

head(merged_df)
  Species  Part Length Width
1  setosa Sepal    5.1   3.5
2  setosa Sepal    4.9   3.0
3  setosa Sepal    4.7   3.2
4  setosa Sepal    4.6   3.1
5  setosa Sepal    5.0   3.6
6  setosa Sepal    5.4   3.9

Upvotes: 1

Related Questions