Reputation: 700
I have a dataframe df
df<- structure(list(year = structure(c(72L, 2L, 13L, 24L, 35L, 46L,
57L, 68L, 70L, 71L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 25L, 26L, 27L,
28L, 29L, 30L, 31L, 32L, 33L, 34L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 43L, 44L, 45L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L,
56L, 58L, 59L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 69L), .Label = c("day",
"id.1.00000", "id.10.0000", "id.11.0000", "id.12.0000", "id.13.0000",
"id.14.0000", "id.15.0000", "id.16.0000", "id.17.0000", "id.18.0000",
"id.19.0000", "id.2.00000", "id.20.0000", "id.21.0000", "id.22.0000",
"id.23.0000", "id.24.0000", "id.25.0000", "id.26.0000", "id.27.0000",
"id.28.0000", "id.29.0000", "id.3.00000", "id.30.0000", "id.31.0000",
"id.32.0000", "id.33.0000", "id.34.0000", "id.35.0000", "id.36.0000",
"id.37.0000", "id.38.0000", "id.39.0000", "id.4.00000", "id.40.0000",
"id.41.0000", "id.42.0000", "id.43.0000", "id.44.0000", "id.45.0000",
"id.46.0000", "id.47.0000", "id.48.0000", "id.49.0000", "id.5.00000",
"id.50.0000", "id.51.0000", "id.52.0000", "id.53.0000", "id.54.0000",
"id.55.0000", "id.56.0000", "id.57.0000", "id.58.0000", "id.59.0000",
"id.6.00000", "id.60.0000", "id.61.0000", "id.62.0000", "id.63.0000",
"id.64.0000", "id.65.0000", "id.66.0000", "id.67.0000", "id.68.0000",
"id.69.0000", "id.7.00000", "id.70.0000", "id.8.00000", "id.9.00000",
"month"), class = "factor"), `2012` = c(1, 76.1, 38.1, 88.6,
132.5, 241.9, 343.2, 343.2, 291.7, 82.8, 17.6, 17.6, 17.5, 17.6,
17.6, 17.6, 17.5, 18.7, 20.4, 17.3, 16.8, 67.6, 66.2, 16.8, 16.8,
16.8, 121.8, 0.1, 99.9, 101.5, 52, 81.6, 68.5, 37.6, 46.2, 23.4,
31, 55.7, 24, 384.8, 53.3, 22.7, 18.6, 25.4, 42.9, 16.5, 75,
69.7, 43.2, 90.9, 18.5, 53.1, 21.6, 66.9, 40.8, 153.3, 47.2,
98.6, 75.8, 141.6, 9.8, 12.7, 12.7, 69.2, 18.3, 359, 153.4, 24.3,
24.3, 24.3, 7.9), `2012` = c(2, 104.6, 41.4, 24.8, 25, 299.3,
181.6, 181.6, 223, 49.1, 9.3, 9.3, 8.7, 9.3, 9.3, 9.3, 8.7, 9.6,
5.2, 3.9, 4.4, 29.2, 30.3, 4.4, 4.4, 4.4, 49.1, 7.2, 27.7, 12.7,
20.5, 15.2, 19.7, 4.8, 35.7, 31.6, 6, 7.4, 2.1, 452.5, 43.2,
92.8, 62, 17.3, 8.4, 14.2, 94.5, 60.1, 28.3, 18.8, 0, 49.1, 11.4,
47.7, 44.6, 73.3, 23, 62.9, 30.5, 95, 45.8, 42.4, 42.4, 31.1,
33.8, 340, 63.8, 12.9, 12.9, 12.9, 8.3), `2012` = c(3, 196.1,
75.8, 26.7, 27.5, 410.1, 287.8, 287.8, 254.9, 56.9, 24.6, 24.6,
20.8, 24.6, 24.6, 24.6, 20.8, 13.3, 21.3, 20.7, 22.9, 41.2, 42.9,
22.9, 22.9, 22.9, 63.4, 34.1, 15.6, 11.1, 8.5, 10.4, 8.1, 35.6,
27, 20.2, 18.1, 33.7, 45.3, 363.2, 32.3, 28.9, 14.6, 10.7, 45.6,
18.3, 159.2, 51.7, 33.6, 17.5, 52.8, 62.9, 20.5, 28.3, 21, 28.1,
24.3, 48.1, 41.6, 179.6, 2.7, 36.9, 36.9, 59, 41.4, 349, 59.2,
21.5, 21.5, 21.5, 31.7), `2012` = c(4, 27.5, 45.4, 87.6, 123.1,
290.6, 99.9, 99.9, 103.9, 33.7, 24.9, 24.9, 21.2, 24.9, 24.9,
24.9, 21.2, 15.6, 57.2, 48.3, 43.6, 51.8, 54.8, 43.6, 43.6, 43.6,
67.8, 58.6, 26.5, 25.5, 26.6, 19.3, 24.2, 24.8, 43.8, 39.2, 67.4,
190.8, 121.5, 462.7, 0, 79.8, 45.8, 23.6, 161.1, 120.7, 119.8,
44.4, 62, 66.4, 63.1, 68, 19.4, 50.8, 63.7, 122.9, 119.3, 103.1,
81.3, 97.1, 14.8, 36.6, 36.6, 35.3, 82.3, 313, 75.1, 18.6, 18.6,
18.6, 67.1), `2012` = c(5, 72.5, 102.2, 66.2, 70.8, 214.6, 73.4,
73.4, 69.7, 72.1, 35.5, 35.5, 31, 35.5, 35.5, 35.5, 31, 24.1,
67.8, 67, 60.1, 82.3, 86, 60.1, 60.1, 60.1, 48.5, 62.7, 53.4,
54.2, 42.9, 55.6, 19, 0, 47.5, 26.1, 91.5, 32.5, 51.6, 389.3,
0, 30.5, 10.9, 40.4, 89.5, 62.9, 123.9, 71.2, 151.6, 69.4, 101.4,
49.8, 27.1, 35, 19.6, 72.6, 31.7, 120, 143.3, 54.2, 8.8, 3.6,
3.6, 33.3, 143.8, 221, 59.5, 25.3, 25.3, 25.3, 314.6), `2012` = c(6,
101.8, 124.8, 117, 136.3, 194.3, 142.1, 142.1, 135.2, 87.9, 75.2,
75.2, 72.7, 75.2, 75.2, 75.2, 72.7, 61.5, 101.8, 108.6, 103.7,
125.8, 131.9, 103.7, 103.7, 103.7, 52.8, 200.5, 81.9, 103.6,
68.5, 83.2, 65.2, 1.2, 79.6, 64.7, 76.3, 73, 35.6, 238.4, 0,
13.6, 0.1, 26.7, 65, 17.5, 169.5, 36.3, 71.8, 86.4, 3.1, 117.9,
23.6, 61.6, 93.3, 143.1, 122.6, 167, 96.7, 77.6, 33.5, 0, 0,
94.8, 160, 220, 89.3, 23.3, 23.3, 23.3, 399.9), `2012` = c(7,
62.8, 68.9, 108.8, 180.2, 189.2, 51.8, 51.8, 46.2, 50.2, 68.7,
68.7, 64.1, 68.7, 68.7, 68.7, 64.1, 60, 107.8, 101.8, 90.7, 40.6,
41.7, 90.7, 90.7, 90.7, 53.8, 289.2, 72.9, 86.2, 119.4, 66.8,
59.4, 0, 97.5, 67.3, 63.4, 47.7, 33, 142, 0, 35.8, 17.9, 66.1,
25.6, 0.6, 246.8, 109.1, 122.4, 89.7, 2.6, 44.6, 8, 91, 60.2,
100.8, 99.8, 59.2, 112.9, 10.2, 66.5, 0.8, 0.8, 64.5, 131.4,
173, 50.4, 7.3, 7.3, 7.3, 590), `2012` = c(8, 98.2, 81.1, 40.7,
48.9, 61.8, 33.5, 33.5, 27.1, 73.7, 80.9, 80.9, 82.1, 80.9, 80.9,
80.9, 82.1, 82, 75.9, 78.3, 75.5, 99.3, 97.2, 75.5, 75.5, 75.5,
44.2, 240.8, 46.8, 44.7, 51.4, 45.2, 46, 2.6, 43.8, 23.5, 10.6,
21.1, 40.1, 91.6, 0, 23.5, 18.6, 34.8, 48, 36.2, 124.6, 153.4,
146.6, 87.2, 12.4, 98, 60.6, 69.1, 118.4, 115.1, 43.6, 108.1,
117.8, 0, 5.8, 13.9, 13.9, 57.4, 79, 165, 67.5, 60, 60, 60, 357
), `2012` = c(9, 23.9, 38.9, 53.2, 60.9, 44.2, 17.3, 17.3, 17.4,
54.2, 45.6, 45.6, 43.6, 45.6, 45.6, 45.6, 43.6, 44.8, 30.8, 27.2,
27.2, 79.2, 84.3, 27.2, 27.2, 27.2, 95.1, 125.9, 42.7, 30.8,
27.4, 43.1, 35.5, 49.2, 64.4, 22.5, 38.6, 60.6, 67.1, 20.3, 0,
69, 9, 131.8, 56.1, 58.2, 272.8, 163.3, 198.6, 50.9, 67.1, 75.6,
85.3, 72.4, 63.5, 126.9, 63.2, 118.7, 112, 0.2, 32.8, 10.3, 10.3,
48.2, 30.2, 169, 125.8, 87, 87, 87, 512.9), `2012` = c(10, 81.6,
29.2, 112.8, 119.8, 29.1, 330.5, 330.5, 295.4, 111, 60.3, 60.3,
54.9, 60.3, 60.3, 60.3, 54.9, 43.8, 53.6, 63.8, 67.7, 84.4, 87.8,
67.7, 67.7, 67.7, 166.6, 47.3, 38.3, 39.2, 44.2, 33, 46.6, 40.6,
92.5, 60.3, 84.4, 122.7, 184.9, 24.1, 2.1, 48.4, 22.5, 81.8,
111.3, 148.3, 119.7, 120, 142.3, 91.8, 150.6, 108.9, 55.9, 65.6,
69.2, 135.9, 110.7, 114.6, 100, 103.4, 28.2, 38.4, 38.4, 127.2,
87, 203, 172, 53.8, 53.8, 53.8, 258.7), `2012` = c(11, 92.3,
60.8, 41.8, 68.3, 43.6, 284.6, 284.6, 276.4, 45.9, 23.7, 23.7,
21.4, 23.7, 23.7, 23.7, 21.4, 17.1, 27.2, 26.5, 26.5, 33.4, 36.8,
26.5, 26.5, 26.5, 31.8, 43.9, 67.4, 46.6, 69.8, 62.5, 40.3, 68.2,
38.8, 54.6, 39.2, 81.9, 66.7, 73.1, 56.9, 177.4, 194.8, 136.5,
184.8, 183.7, 126.6, 158.5, 200.9, 37.2, 238.5, 84.3, 51.5, 48,
56.3, 103.3, 101, 20.8, 39.4, 161.3, 2.6, 9, 9, 37.6, 33.5, 223,
120.2, 49.6, 49.6, 49.6, 54.7), `2012` = c(12, 34, 36.5, 156.7,
195.4, 132.6, 310.4, 310.4, 271.8, 87.6, 15.3, 15.3, 13.2, 15.3,
15.3, 15.3, 13.2, 10.3, 17.5, 15.8, 14.8, 77.5, 74.5, 14.8, 14.8,
14.8, 122.4, 5.1, 81.1, 74.5, 58.7, 69.1, 55.8, 3.3, 48.1, 33.3,
83.9, 173.5, 30.4, 169.4, 41.7, 86.6, 67.9, 36.3, 58.8, 121.9,
100.7, 163.3, 155.5, 135.4, 255.8, 60, 18.7, 67.9, 57.8, 150.3,
136.2, 109.2, 123.3, 179.5, 35.3, 92.4, 92.4, 71.4, 50.5, 237,
135.4, 16.5, 16.5, 16.5, 1.1)), .Names = c("year", "2012", "2012",
"2012", "2012", "2012", "2012", "2012", "2012", "2012", "2012",
"2012", "2012"), row.names = c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L, 46L, 47L, 48L,
49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L,
62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L, 70L, 71L, 72L), class = "data.frame")
I want to transform my dataframe using the R package dplyr
. So far, year and month are in rows but I want them in columns. Basically I want a dataframe with a column containing the IDs, one with year, one with month and one with the values.
The structure will look like this with an ID column additional.
df2<-structure(list(YEAR = c(1980L, 1980L, 1980L, 1980L, 1980L, 1980L,
1980L, 1980L, 1980L, 1980L), MONTH = 1:10, PRCP = c(46.3, 20.7,
101.3, 27.2, 67.5, 34.1, 12, 95.4, 17, 31.8)), .Names = c("YEAR",
"MONTH", "PRCP"), row.names = c(NA, 10L), class = "data.frame")
Can someone help me out? Thanks
Upvotes: 0
Views: 1007
Reputation: 18585
Here is the code using dplyr / tidyr combination and magrittr (for convenience).
Vectorize(require)(package = c("tidyr", # Data manipulation
"dplyr", # Data manipulation
"magrittr" # Reverse pipe
),
char = TRUE)
dfTr <- df %>%
gather(key = key, value = value, convert = TRUE, -year) %>%
setNames(make.names(names(.), unique = TRUE)) %>%
select(year, key, value)
You can ad the following syntax to obtain one id
column:
# Unite is optional as it will generate one ID column
unite(year_month, key, year)
which would produce:
year key value
1 month 2012 1.0
2 id.1.00000 2012 76.1
3 id.2.00000 2012 38.1
Sample approach excluding that month
row:
dfTr2 <- df %>%
setNames(make.names(names(.), unique = TRUE)) %>%
filter(year != "month") %>%
gather(key = key, value = value, convert = TRUE, -year) %>%
select(year, key, value) %>%
unite(year_month, key, year) %>%
mutate(year_month = gsub("X","",year_month))
It crossed my mind that you may want to use the month figure, a small example below extracts the relevant digit:
library(stringr)
dfTr3 <- df %>%
setNames(make.names(names(.), unique = TRUE)) %>%
filter(year != "month") %>%
gather(key = key, value = value, convert = TRUE, -year) %>%
select(year, key, value) %>%
unite(year_month, key, year) %>%
mutate(year_month = gsub("X","",year_month)) %>%
mutate(month = gsub(".", "", str_extract_all(string = year_month,
pattern = "\\.\\d{1,2}\\."),
fixed = TRUE))
which produces:
> head(dfTr3, n = 2)
year_month value month
1 2012_id.1.00000 76.1 1
2 2012_id.2.00000 38.1 2
Upvotes: 3
Reputation: 7248
Your data is in an odd form; it appears that you have two header rows, which is not something supported in an R data frame. Most likely there was something wrong with the initial import, but I can see what it is you are trying to do.
My solution would be to first fix up the data frame so that it shows what we want, which is for the headers to be the combination of the current headers (year) and the first row (month)
> paste(names(df), df[1,])
[1] "year 72" "2012 1" "2012 2" "2012 3" "2012 4" "2012 5" "2012 6"
[8] "2012 7" "2012 8" "2012 9" "2012 10" "2012 11" "2012 12"
That looks good (except for "year 72", which we'll fix up later), so lets set it:
> names(df) <- paste(names(df), df[1,])
> df %>% head
year 72 2012 1 2012 2 2012 3 2012 4 2012 5 2012 6 2012 7 2012 8 2012 9
1 month 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0
3 id.1.00000 76.1 104.6 196.1 27.5 72.5 101.8 62.8 98.2 23.9
4 id.2.00000 38.1 41.4 75.8 45.4 102.2 124.8 68.9 81.1 38.9
Now let's get rid of that first row
> df <- tail(df, -1)
> head(df)
year 72 2012 1 2012 2 2012 3 2012 4 2012 5 2012 6 2012 7 2012 8 2012 9
3 id.1.00000 76.1 104.6 196.1 27.5 72.5 101.8 62.8 98.2 23.9
4 id.2.00000 38.1 41.4 75.8 45.4 102.2 124.8 68.9 81.1 38.9
Now we can use tidyr
, a companion package to dplyr
for reshaping data frames, to put it into the desired form.
> require(tidyr)
> df %>% gather(date, value, -`year 72`) %>% head
year 72 date value
1 id.1.00000 2012 1 76.1
2 id.2.00000 2012 1 38.1
3 id.3.00000 2012 1 88.6
4 id.4.00000 2012 1 132.5
5 id.5.00000 2012 1 241.9
From here, we can use separate
to break the "date" column into "year" and "month" columns.
Upvotes: 3
Reputation: 561
This should do the trick!
rownames(df) <- seq(length=nrow(df))
colnames(df)[2:13] <- c(1:12)
tmp <- melt(df[2:nrow(df),],id = c('year'))
colnames(tmp) <- c('id','Month','Value')
library(sqldf)
sqldf("Select * from tmp order by id")
Upvotes: 2