SimonB
SimonB

Reputation: 700

Transform dataframe in R using dplyr

I have a dataframe df

df<- structure(list(year = structure(c(72L, 2L, 13L, 24L, 35L, 46L, 
57L, 68L, 70L, 71L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L, 
28L, 29L, 30L, 31L, 32L, 33L, 34L, 36L, 37L, 38L, 39L, 40L, 41L, 
42L, 43L, 44L, 45L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 
56L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 69L), .Label = c("day", 
"id.1.00000", "id.10.0000", "id.11.0000", "id.12.0000", "id.13.0000", 
"id.14.0000", "id.15.0000", "id.16.0000", "id.17.0000", "id.18.0000", 
"id.19.0000", "id.2.00000", "id.20.0000", "id.21.0000", "id.22.0000", 
"id.23.0000", "id.24.0000", "id.25.0000", "id.26.0000", "id.27.0000", 
"id.28.0000", "id.29.0000", "id.3.00000", "id.30.0000", "id.31.0000", 
"id.32.0000", "id.33.0000", "id.34.0000", "id.35.0000", "id.36.0000", 
"id.37.0000", "id.38.0000", "id.39.0000", "id.4.00000", "id.40.0000", 
"id.41.0000", "id.42.0000", "id.43.0000", "id.44.0000", "id.45.0000", 
"id.46.0000", "id.47.0000", "id.48.0000", "id.49.0000", "id.5.00000", 
"id.50.0000", "id.51.0000", "id.52.0000", "id.53.0000", "id.54.0000", 
"id.55.0000", "id.56.0000", "id.57.0000", "id.58.0000", "id.59.0000", 
"id.6.00000", "id.60.0000", "id.61.0000", "id.62.0000", "id.63.0000", 
"id.64.0000", "id.65.0000", "id.66.0000", "id.67.0000", "id.68.0000", 
"id.69.0000", "id.7.00000", "id.70.0000", "id.8.00000", "id.9.00000", 
"month"), class = "factor"), `2012` = c(1, 76.1, 38.1, 88.6, 
132.5, 241.9, 343.2, 343.2, 291.7, 82.8, 17.6, 17.6, 17.5, 17.6, 
17.6, 17.6, 17.5, 18.7, 20.4, 17.3, 16.8, 67.6, 66.2, 16.8, 16.8, 
16.8, 121.8, 0.1, 99.9, 101.5, 52, 81.6, 68.5, 37.6, 46.2, 23.4, 
31, 55.7, 24, 384.8, 53.3, 22.7, 18.6, 25.4, 42.9, 16.5, 75, 
69.7, 43.2, 90.9, 18.5, 53.1, 21.6, 66.9, 40.8, 153.3, 47.2, 
98.6, 75.8, 141.6, 9.8, 12.7, 12.7, 69.2, 18.3, 359, 153.4, 24.3, 
24.3, 24.3, 7.9), `2012` = c(2, 104.6, 41.4, 24.8, 25, 299.3, 
181.6, 181.6, 223, 49.1, 9.3, 9.3, 8.7, 9.3, 9.3, 9.3, 8.7, 9.6, 
5.2, 3.9, 4.4, 29.2, 30.3, 4.4, 4.4, 4.4, 49.1, 7.2, 27.7, 12.7, 
20.5, 15.2, 19.7, 4.8, 35.7, 31.6, 6, 7.4, 2.1, 452.5, 43.2, 
92.8, 62, 17.3, 8.4, 14.2, 94.5, 60.1, 28.3, 18.8, 0, 49.1, 11.4, 
47.7, 44.6, 73.3, 23, 62.9, 30.5, 95, 45.8, 42.4, 42.4, 31.1, 
33.8, 340, 63.8, 12.9, 12.9, 12.9, 8.3), `2012` = c(3, 196.1, 
75.8, 26.7, 27.5, 410.1, 287.8, 287.8, 254.9, 56.9, 24.6, 24.6, 
20.8, 24.6, 24.6, 24.6, 20.8, 13.3, 21.3, 20.7, 22.9, 41.2, 42.9, 
22.9, 22.9, 22.9, 63.4, 34.1, 15.6, 11.1, 8.5, 10.4, 8.1, 35.6, 
27, 20.2, 18.1, 33.7, 45.3, 363.2, 32.3, 28.9, 14.6, 10.7, 45.6, 
18.3, 159.2, 51.7, 33.6, 17.5, 52.8, 62.9, 20.5, 28.3, 21, 28.1, 
24.3, 48.1, 41.6, 179.6, 2.7, 36.9, 36.9, 59, 41.4, 349, 59.2, 
21.5, 21.5, 21.5, 31.7), `2012` = c(4, 27.5, 45.4, 87.6, 123.1, 
290.6, 99.9, 99.9, 103.9, 33.7, 24.9, 24.9, 21.2, 24.9, 24.9, 
24.9, 21.2, 15.6, 57.2, 48.3, 43.6, 51.8, 54.8, 43.6, 43.6, 43.6, 
67.8, 58.6, 26.5, 25.5, 26.6, 19.3, 24.2, 24.8, 43.8, 39.2, 67.4, 
190.8, 121.5, 462.7, 0, 79.8, 45.8, 23.6, 161.1, 120.7, 119.8, 
44.4, 62, 66.4, 63.1, 68, 19.4, 50.8, 63.7, 122.9, 119.3, 103.1, 
81.3, 97.1, 14.8, 36.6, 36.6, 35.3, 82.3, 313, 75.1, 18.6, 18.6, 
18.6, 67.1), `2012` = c(5, 72.5, 102.2, 66.2, 70.8, 214.6, 73.4, 
73.4, 69.7, 72.1, 35.5, 35.5, 31, 35.5, 35.5, 35.5, 31, 24.1, 
67.8, 67, 60.1, 82.3, 86, 60.1, 60.1, 60.1, 48.5, 62.7, 53.4, 
54.2, 42.9, 55.6, 19, 0, 47.5, 26.1, 91.5, 32.5, 51.6, 389.3, 
0, 30.5, 10.9, 40.4, 89.5, 62.9, 123.9, 71.2, 151.6, 69.4, 101.4, 
49.8, 27.1, 35, 19.6, 72.6, 31.7, 120, 143.3, 54.2, 8.8, 3.6, 
3.6, 33.3, 143.8, 221, 59.5, 25.3, 25.3, 25.3, 314.6), `2012` = c(6, 
101.8, 124.8, 117, 136.3, 194.3, 142.1, 142.1, 135.2, 87.9, 75.2, 
75.2, 72.7, 75.2, 75.2, 75.2, 72.7, 61.5, 101.8, 108.6, 103.7, 
125.8, 131.9, 103.7, 103.7, 103.7, 52.8, 200.5, 81.9, 103.6, 
68.5, 83.2, 65.2, 1.2, 79.6, 64.7, 76.3, 73, 35.6, 238.4, 0, 
13.6, 0.1, 26.7, 65, 17.5, 169.5, 36.3, 71.8, 86.4, 3.1, 117.9, 
23.6, 61.6, 93.3, 143.1, 122.6, 167, 96.7, 77.6, 33.5, 0, 0, 
94.8, 160, 220, 89.3, 23.3, 23.3, 23.3, 399.9), `2012` = c(7, 
62.8, 68.9, 108.8, 180.2, 189.2, 51.8, 51.8, 46.2, 50.2, 68.7, 
68.7, 64.1, 68.7, 68.7, 68.7, 64.1, 60, 107.8, 101.8, 90.7, 40.6, 
41.7, 90.7, 90.7, 90.7, 53.8, 289.2, 72.9, 86.2, 119.4, 66.8, 
59.4, 0, 97.5, 67.3, 63.4, 47.7, 33, 142, 0, 35.8, 17.9, 66.1, 
25.6, 0.6, 246.8, 109.1, 122.4, 89.7, 2.6, 44.6, 8, 91, 60.2, 
100.8, 99.8, 59.2, 112.9, 10.2, 66.5, 0.8, 0.8, 64.5, 131.4, 
173, 50.4, 7.3, 7.3, 7.3, 590), `2012` = c(8, 98.2, 81.1, 40.7, 
48.9, 61.8, 33.5, 33.5, 27.1, 73.7, 80.9, 80.9, 82.1, 80.9, 80.9, 
80.9, 82.1, 82, 75.9, 78.3, 75.5, 99.3, 97.2, 75.5, 75.5, 75.5, 
44.2, 240.8, 46.8, 44.7, 51.4, 45.2, 46, 2.6, 43.8, 23.5, 10.6, 
21.1, 40.1, 91.6, 0, 23.5, 18.6, 34.8, 48, 36.2, 124.6, 153.4, 
146.6, 87.2, 12.4, 98, 60.6, 69.1, 118.4, 115.1, 43.6, 108.1, 
117.8, 0, 5.8, 13.9, 13.9, 57.4, 79, 165, 67.5, 60, 60, 60, 357
), `2012` = c(9, 23.9, 38.9, 53.2, 60.9, 44.2, 17.3, 17.3, 17.4, 
54.2, 45.6, 45.6, 43.6, 45.6, 45.6, 45.6, 43.6, 44.8, 30.8, 27.2, 
27.2, 79.2, 84.3, 27.2, 27.2, 27.2, 95.1, 125.9, 42.7, 30.8, 
27.4, 43.1, 35.5, 49.2, 64.4, 22.5, 38.6, 60.6, 67.1, 20.3, 0, 
69, 9, 131.8, 56.1, 58.2, 272.8, 163.3, 198.6, 50.9, 67.1, 75.6, 
85.3, 72.4, 63.5, 126.9, 63.2, 118.7, 112, 0.2, 32.8, 10.3, 10.3, 
48.2, 30.2, 169, 125.8, 87, 87, 87, 512.9), `2012` = c(10, 81.6, 
29.2, 112.8, 119.8, 29.1, 330.5, 330.5, 295.4, 111, 60.3, 60.3, 
54.9, 60.3, 60.3, 60.3, 54.9, 43.8, 53.6, 63.8, 67.7, 84.4, 87.8, 
67.7, 67.7, 67.7, 166.6, 47.3, 38.3, 39.2, 44.2, 33, 46.6, 40.6, 
92.5, 60.3, 84.4, 122.7, 184.9, 24.1, 2.1, 48.4, 22.5, 81.8, 
111.3, 148.3, 119.7, 120, 142.3, 91.8, 150.6, 108.9, 55.9, 65.6, 
69.2, 135.9, 110.7, 114.6, 100, 103.4, 28.2, 38.4, 38.4, 127.2, 
87, 203, 172, 53.8, 53.8, 53.8, 258.7), `2012` = c(11, 92.3, 
60.8, 41.8, 68.3, 43.6, 284.6, 284.6, 276.4, 45.9, 23.7, 23.7, 
21.4, 23.7, 23.7, 23.7, 21.4, 17.1, 27.2, 26.5, 26.5, 33.4, 36.8, 
26.5, 26.5, 26.5, 31.8, 43.9, 67.4, 46.6, 69.8, 62.5, 40.3, 68.2, 
38.8, 54.6, 39.2, 81.9, 66.7, 73.1, 56.9, 177.4, 194.8, 136.5, 
184.8, 183.7, 126.6, 158.5, 200.9, 37.2, 238.5, 84.3, 51.5, 48, 
56.3, 103.3, 101, 20.8, 39.4, 161.3, 2.6, 9, 9, 37.6, 33.5, 223, 
120.2, 49.6, 49.6, 49.6, 54.7), `2012` = c(12, 34, 36.5, 156.7, 
195.4, 132.6, 310.4, 310.4, 271.8, 87.6, 15.3, 15.3, 13.2, 15.3, 
15.3, 15.3, 13.2, 10.3, 17.5, 15.8, 14.8, 77.5, 74.5, 14.8, 14.8, 
14.8, 122.4, 5.1, 81.1, 74.5, 58.7, 69.1, 55.8, 3.3, 48.1, 33.3, 
83.9, 173.5, 30.4, 169.4, 41.7, 86.6, 67.9, 36.3, 58.8, 121.9, 
100.7, 163.3, 155.5, 135.4, 255.8, 60, 18.7, 67.9, 57.8, 150.3, 
136.2, 109.2, 123.3, 179.5, 35.3, 92.4, 92.4, 71.4, 50.5, 237, 
135.4, 16.5, 16.5, 16.5, 1.1)), .Names = c("year", "2012", "2012", 
"2012", "2012", "2012", "2012", "2012", "2012", "2012", "2012", 
"2012", "2012"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L, 
49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 
62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L), class = "data.frame")

I want to transform my dataframe using the R package dplyr. So far, year and month are in rows but I want them in columns. Basically I want a dataframe with a column containing the IDs, one with year, one with month and one with the values. The structure will look like this with an ID column additional.

df2<-structure(list(YEAR = c(1980L, 1980L, 1980L, 1980L, 1980L, 1980L, 
1980L, 1980L, 1980L, 1980L), MONTH = 1:10, PRCP = c(46.3, 20.7, 
101.3, 27.2, 67.5, 34.1, 12, 95.4, 17, 31.8)), .Names = c("YEAR", 
"MONTH", "PRCP"), row.names = c(NA, 10L), class = "data.frame")

Can someone help me out? Thanks

Upvotes: 0

Views: 1007

Answers (3)

Konrad
Konrad

Reputation: 18585

Here is the code using / combination and (for convenience).

Vectorize(require)(package = c("tidyr",        # Data manipulation
                               "dplyr",        # Data manipulation
                               "magrittr"      # Reverse pipe
),
char = TRUE)

dfTr <- df %>%
  gather(key = key, value = value, convert = TRUE, -year) %>% 
  setNames(make.names(names(.), unique = TRUE)) %>% 
  select(year, key, value) 

You can ad the following syntax to obtain one id column:

  # Unite is optional as it will generate one ID column
  unite(year_month, key, year)

which would produce:

        year  key value
1      month 2012   1.0
2 id.1.00000 2012  76.1
3 id.2.00000 2012  38.1

With exclusion

Sample approach excluding that month row:

dfTr2 <- df %>%
  setNames(make.names(names(.), unique = TRUE)) %>% 
  filter(year != "month") %>% 
  gather(key = key, value = value, convert = TRUE, -year) %>% 
  select(year, key, value) %>% 
  unite(year_month, key, year) %>% 
  mutate(year_month = gsub("X","",year_month))

Cleaning month value

It crossed my mind that you may want to use the month figure, a small example below extracts the relevant digit:

library(stringr)
dfTr3 <- df %>%
  setNames(make.names(names(.), unique = TRUE)) %>% 
  filter(year != "month") %>% 
  gather(key = key, value = value, convert = TRUE, -year) %>% 
  select(year, key, value) %>% 
  unite(year_month, key, year) %>% 
  mutate(year_month = gsub("X","",year_month)) %>% 
  mutate(month = gsub(".", "", str_extract_all(string = year_month, 
                                 pattern = "\\.\\d{1,2}\\."), 
         fixed = TRUE))

which produces:

> head(dfTr3, n = 2)
       year_month value month
1 2012_id.1.00000  76.1     1
2 2012_id.2.00000  38.1     2

Upvotes: 3

user295691
user295691

Reputation: 7248

Your data is in an odd form; it appears that you have two header rows, which is not something supported in an R data frame. Most likely there was something wrong with the initial import, but I can see what it is you are trying to do.

My solution would be to first fix up the data frame so that it shows what we want, which is for the headers to be the combination of the current headers (year) and the first row (month)

> paste(names(df), df[1,])
 [1] "year 72" "2012 1"  "2012 2"  "2012 3"  "2012 4"  "2012 5"  "2012 6"
 [8] "2012 7"  "2012 8"  "2012 9"  "2012 10" "2012 11" "2012 12"

That looks good (except for "year 72", which we'll fix up later), so lets set it:

> names(df) <- paste(names(df), df[1,])
> df %>% head
     year 72 2012 1 2012 2 2012 3 2012 4 2012 5 2012 6 2012 7 2012 8 2012 9
1      month    1.0    2.0    3.0    4.0    5.0    6.0    7.0    8.0    9.0
3 id.1.00000   76.1  104.6  196.1   27.5   72.5  101.8   62.8   98.2   23.9
4 id.2.00000   38.1   41.4   75.8   45.4  102.2  124.8   68.9   81.1   38.9

Now let's get rid of that first row

> df <- tail(df, -1)
> head(df)
      year 72 2012 1 2012 2 2012 3 2012 4 2012 5 2012 6 2012 7 2012 8 2012 9
3  id.1.00000   76.1  104.6  196.1   27.5   72.5  101.8   62.8   98.2   23.9
4  id.2.00000   38.1   41.4   75.8   45.4  102.2  124.8   68.9   81.1   38.9

Now we can use tidyr, a companion package to dplyr for reshaping data frames, to put it into the desired form.

> require(tidyr)
> df %>% gather(date, value, -`year 72`) %>% head
       year 72    date value
1   id.1.00000  2012 1  76.1
2   id.2.00000  2012 1  38.1
3   id.3.00000  2012 1  88.6
4   id.4.00000  2012 1 132.5
5   id.5.00000  2012 1 241.9

From here, we can use separate to break the "date" column into "year" and "month" columns.

Upvotes: 3

user2600629
user2600629

Reputation: 561

This should do the trick!

rownames(df) <- seq(length=nrow(df))
colnames(df)[2:13] <- c(1:12)
tmp <- melt(df[2:nrow(df),],id = c('year'))
colnames(tmp) <- c('id','Month','Value')
library(sqldf)
sqldf("Select * from tmp order by id")

Upvotes: 2

Related Questions