Reputation: 13
I have a dataframe, I would like to calculate all the mean values of x and all the sum of y group by the first row of the dateframe.
The following link is the result I want. The result expected
Here are the data.
dt=structure(list(year = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("1980",
"1981", "1982", "1985", "group"), class = "factor"), x1 = structure(c(4L,
1L, 3L, 2L, 1L), .Label = c("1", "2", "4", "A"), class = "factor"),
y1 = structure(c(4L, 1L, 3L, 2L, 2L), .Label = c("1", "3",
"5", "A"), class = "factor"), x2 = structure(c(5L, 1L, 4L,
3L, 2L), .Label = c("2", "4", "5", "6", "A"), class = "factor"),
y2 = structure(c(4L, 1L, 3L, 3L, 2L), .Label = c("3", "5",
"7", "A"), class = "factor"), x3 = structure(c(4L, 1L, 3L,
2L, 1L), .Label = c("4", "6", "8", "B"), class = "factor"),
y3 = structure(c(4L, 1L, 3L, 2L, 1L), .Label = c("3", "5",
"6", "B"), class = "factor"), x4 = structure(c(4L, 1L, 3L,
2L, 3L), .Label = c("2", "4", "5", "C"), class = "factor"),
y4 = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("3", "4",
"5", "6", "C"), class = "factor"), x5 = structure(c(5L, 2L,
1L, 3L, 4L), .Label = c("3", "4", "6", "7", "C"), class = "factor"),
y5 = structure(c(4L, 2L, 1L, 3L, 2L), .Label = c("2", "5",
"8", "C"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
And result expected,
result_expected <- structure(list(year = c(1980L, 1981L, 1982L, 1985L), A_x_mean = c(1.5,
5, 3.5, 2.5), A_y_sum = c(4L, 12L, 10L, 8L), B_x_mean = c(4L,
8L, 6L, 4L), B_y_sum = c(3L, 6L, 5L, 3L), C_x_mean = 3:6, C_y_sum = c(8L,
6L, 13L, 11L)), class = "data.frame", row.names = c(NA, -4L))
I have search key words in goole and stackoverflow, but no proper answers. My current thinking is to calculate unique group A,B,C in first row.
require(tidyverse)
group_variables <- dt%>%gather(key,value)%>%distinct(value)%>%arrange(value)
then get the row in group_variables
by the for
for i in group_variables{......}
or can I change the structure of the dataframe by gathe
and spread
in tidyr
,and by dplyr
method, something just like following code,
dt_new%>% group_by (group)%>%
summarise(mean=mean(x,na.rm=TRUE),
sum=sum(x,na.rm=TURE))
Upvotes: 1
Views: 229
Reputation: 46908
First we need to take out the first row having the group, make the data frame long, simplify x1,x2,x3 to x etc and put the groups back:
group_var = sapply(dt[1,-1],as.character)
mat <-
dt[-1,] %>% pivot_longer(-year) %>%
mutate(value=as.numeric(as.character(value))) %>%
mutate(group=as.character(group_var[as.character(name)])) %>%
mutate(name=substr(name,1,1))
mat
# A tibble: 40 x 4
year name value group
<fct> <chr> <dbl> <chr>
1 1980 x 1 A
2 1980 y 1 A
3 1980 x 2 A
4 1980 y 3 A
5 1980 x 4 B
6 1980 y 3 B
7 1980 x 2 C
8 1980 y 3 C
9 1980 x 4 C
10 1980 y 5 C
Now what's left is to group them according to year, name and group and do the respective function, so we define a function:
func = function(DF,func){
DF %>%
group_by(group,name,year) %>%
summarise_all(func) %>%
mutate(label=paste(group,name,func,sep="_")) %>%
ungroup %>%
select(year,value,label) %>%
pivot_wider(values_from=value,names_from=label)
}
And we apply it over two parts of the data:
cbind(func(mat %>% filter(name=="x"),"mean"),func(mat %>% filter(name=="y"),"sum"))
year A_x_mean B_x_mean C_x_mean year A_y_sum B_y_sum C_y_sum
1 1980 1.5 4 3 1980 4 3 8
2 1981 5.0 8 4 1981 12 6 6
3 1982 3.5 6 5 1982 10 5 13
4 1985 2.5 4 6 1985 8 3 11
Upvotes: 1
Reputation: 4370
One way would be to make your factors into characters, then make your first row your column names(and remove the first row). Then I did some data manipulation using dplyr
and tidyr
to make the data long by year and letters and then transposed the data into wide format after taking the sum and the mean.
dt=structure(list(year = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("1980",
"1981", "1982", "1985", "group"), class = "factor"), x1 = structure(c(4L,
1L, 3L, 2L, 1L), .Label = c("1", "2", "4", "A"), class = "factor"),
y1 = structure(c(4L, 1L, 3L, 2L, 2L), .Label = c("1", "3",
"5", "A"), class = "factor"), x2 = structure(c(5L, 1L, 4L,
3L, 2L), .Label = c("2", "4", "5", "6", "A"), class = "factor"),
y2 = structure(c(4L, 1L, 3L, 3L, 2L), .Label = c("3", "5",
"7", "A"), class = "factor"), x3 = structure(c(4L, 1L, 3L,
2L, 1L), .Label = c("4", "6", "8", "B"), class = "factor"),
y3 = structure(c(4L, 1L, 3L, 2L, 1L), .Label = c("3", "5",
"6", "B"), class = "factor"), x4 = structure(c(4L, 1L, 3L,
2L, 3L), .Label = c("2", "4", "5", "C"), class = "factor"),
y4 = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("3", "4",
"5", "6", "C"), class = "factor"), x5 = structure(c(5L, 2L,
1L, 3L, 4L), .Label = c("3", "4", "6", "7", "C"), class = "factor"),
y5 = structure(c(4L, 2L, 1L, 3L, 2L), .Label = c("2", "5",
"8", "C"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
dt[sapply(dt, is.factor)] <- sapply(dt, as.character)
colnames(dt) <- dt[1,]
dt2 <- dt[-1,]
library(tidyverse)
dt3 <- pivot_longer(dt2, cols = c("A","B","C"),
names_to = "letters") %>%
ungroup %>%
select(-.copy) %>%
ungroup %>%
mutate(value = as.numeric(value)) %>%
group_by(letters,group) %>%
summarize(meanval = mean(value),
sumval = sum(value)) %>%
ungroup %>%
pivot_wider(names_from = letters,
values_from = c(meanval,sumval))
Upvotes: 0