Reputation: 3195
Let's look at the result of an iris classification. Here is a small sample.
# Sample data (dput output): a 50-row data frame with four measurement
# columns (Sepal.Length, Sepal.Width, Petal.Length, Petal.Width) and an
# integer `flower` column holding the class label (values 1, 2 or 3 here).
iris=structure(list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6,
5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1,
5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2,
5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1,
4.6, 5.3, 5), Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4,
3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4,
3.7, 3.6, 3.3, 3.4, 3, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2,
3.1, 3.2, 3.5, 3.6, 3, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8,
3.2, 3.7, 3.3), Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4, 1.7,
1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7,
1.5, 1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5,
1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9,
1.4, 1.6, 1.4, 1.5, 1.4), Petal.Width = c(0.2, 0.2, 0.2, 0.2,
0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4,
0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5, 0.2, 0.2, 0.4, 0.2, 0.2, 0.2,
0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2, 0.3, 0.3, 0.2,
0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2), flower = c(1L, 3L, 3L, 3L,
1L, 2L, 3L, 1L, 3L, 3L, 2L, 1L, 3L, 3L, 2L, 2L, 2L, 1L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 3L, 1L, 2L, 2L, 3L, 1L,
2L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 2L, 3L, 2L, 3L, 2L, 1L)), class = "data.frame", row.names = c(NA,
-50L))
There are 3 classes of species.
The question is: how do I get descriptive statistics (DS) for each parameter and for each class?
I use library(psych),
but the DS output of the describeBy function is not in a convenient format for me.
I need to get a ready-made table strictly in the format below.
I have provided it as a data frame (dput output):
# Desired output layout (dput output): 16 rows = 4 variables x (classes
# "1", "2", "3" plus "total"). `variable.name` is filled only on the first
# row of each variable and left "" on the following three. The remaining
# columns are the class label, the observation count, mean, stdev, two
# interval bounds (X.95.DI is below the mean, X95.DI above it; presumably
# the lower/upper 95% confidence limits), min and max.
structure(list(variable.name = c("Sepal,Length", "", "", "",
"Sepal,Width", "", "", "", "Petal,Length", "", "", "", "Petal,Width",
"", "", ""), number.of.flower.class = c("1", "2", "3", "total",
"1", "2", "3", "total", "1", "2", "3", "total", "1", "2", "3",
"total"), count.observations.in.class = c(20L, 13L, 17L, 50L,
20L, 13L, 17L, 50L, 20L, 13L, 17L, 50L, 20L, 13L, 17L, 50L),
Mean = c(5.04, 5.4, 4.6647, 5.006, 3.45, 3.8923, 3.0471,
3.428, 1.475, 1.5077, 1.4118, 1.462, 0.27, 0.2692, 0.2, 0.246
), stdev = c(0.1875, 0.23805, 0.21196, 0.35249, 0.11921,
0.23616, 0.22671, 0.37906, 0.1916, 0.18913, 0.13173, 0.17366,
0.12607, 0.10316, 0.06124, 0.10539), X.95.DI = c(4.9522,
5.2561, 4.5557, 4.9058, 3.3942, 3.7496, 2.9305, 3.3203, 1.3853,
1.3934, 1.344, 1.4126, 0.211, 0.2069, 0.1685, 0.216), X95.DI = c(5.1278,
5.5439, 4.7737, 5.1062, 3.5058, 4.035, 3.1636, 3.5357, 1.5647,
1.622, 1.4795, 1.5114, 0.329, 0.3316, 0.2315, 0.276), min = c(4.6,
5.1, 4.3, 4.3, 3.2, 3.5, 2.3, 2.3, 1, 1.2, 1.1, 1, 0.1, 0.1,
0.1, 0.1), max = c(5.4, 5.8, 5, 5.8, 3.7, 4.4, 3.4, 4.4,
1.9, 1.9, 1.6, 1.9, 0.6, 0.4, 0.3, 0.6)), class = "data.frame", row.names = c(NA,
-16L))
How do I restructure the descriptive statistics strictly into this tabular format for each class? I'm interested in the following statistics: variable name
number of flower class
count observations in class
Mean
stdev
-95%DI
95%DI
min
max
Upvotes: 0
Views: 61
Reputation: 4140
I'm not sure if there is a succinct way to summarize by group AND summarize overall, so I calculated your metrics of interest on both and combined them.
library(dplyr)
library(tidyr)
data(iris)
#' One bound of the two-sided 95% t-based confidence interval for a mean
#'
#' @param x Numeric vector of observations.
#' @param dir Which bound to return: "lower" or "upper".
#' @return A single number: `mean(x)` minus (lower) or plus (upper) the
#'   t critical value with `length(x) - 1` degrees of freedom times the
#'   standard error `sd(x) / sqrt(length(x))`.
myci <- function(x, dir) {
  # Fail loudly on an unknown direction instead of silently returning NULL.
  if (!is.character(dir) || length(dir) != 1L ||
      !dir %in% c("lower", "upper")) {
    stop("`dir` must be \"lower\" or \"upper\"", call. = FALSE)
  }
  se.x <- (sd(x) / sqrt(length(x)))
  mean.x <- mean(x)
  # Half-width of the interval; shared by both bounds.
  half_width <- qt(1 - (0.05 / 2), length(x) - 1) * se.x
  if (dir == "lower") {
    mean.x - half_width
  } else {
    mean.x + half_width
  }
}
# Statistics computed for every variable. The list names become the output
# column names (through `.names = "{.fn}"` in summarise_stats()).
stat_fns <- list(
  n = length,
  mean = mean,
  sd = sd,
  min = min,
  max = max,
  ci_lower = function(x) myci(x, "lower"),
  ci_upper = function(x) myci(x, "upper")
)

# Long format: one row per (flower, measurement) pair, with the measurement
# name in `variable` and its number in `value`. Reshaped once and reused for
# both the per-species and the overall summary.
iris_long <- iris %>%
  pivot_longer(-Species, names_to = "variable", values_to = "value")

# Apply every statistic in `stat_fns` to `value` within the current groups.
# `across()` replaces the superseded `summarize_if()`. `.groups = "drop_last"`
# keeps the same grouping on the result that `summarize_if()` produced, so the
# printed tibble is unchanged.
summarise_stats <- function(grouped) {
  grouped %>%
    summarise(across(value, stat_fns, .names = "{.fn}"), .groups = "drop_last")
}

output <- bind_rows(
  # Calculate group metrics
  iris_long %>%
    group_by(Species, variable) %>%
    summarise_stats(),
  # Calculate overall metrics
  iris_long %>%
    group_by(variable) %>%
    summarise_stats() %>%
    mutate(Species = "Total")
)
# Sort by variable. order() is stable, so within each variable the species
# rows stay ahead of the "Total" row.
output[order(output$variable), ]
#> # A tibble: 16 x 9
#> # Groups: Species [4]
#> Species variable n mean sd min max ci_lower ci_upper
#> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 setosa Petal.Length 50 1.46 0.174 1 1.9 1.41 1.51
#> 2 versicolor Petal.Length 50 4.26 0.470 3 5.1 4.13 4.39
#> 3 virginica Petal.Length 50 5.55 0.552 4.5 6.9 5.40 5.71
#> 4 Total Petal.Length 150 3.76 1.77 1 6.9 3.47 4.04
#> 5 setosa Petal.Width 50 0.246 0.105 0.1 0.6 0.216 0.276
#> 6 versicolor Petal.Width 50 1.33 0.198 1 1.8 1.27 1.38
#> 7 virginica Petal.Width 50 2.03 0.275 1.4 2.5 1.95 2.10
#> 8 Total Petal.Width 150 1.20 0.762 0.1 2.5 1.08 1.32
#> 9 setosa Sepal.Length 50 5.01 0.352 4.3 5.8 4.91 5.11
#> 10 versicolor Sepal.Length 50 5.94 0.516 4.9 7 5.79 6.08
#> 11 virginica Sepal.Length 50 6.59 0.636 4.9 7.9 6.41 6.77
#> 12 Total Sepal.Length 150 5.84 0.828 4.3 7.9 5.71 5.98
#> 13 setosa Sepal.Width 50 3.43 0.379 2.3 4.4 3.32 3.54
#> 14 versicolor Sepal.Width 50 2.77 0.314 2 3.4 2.68 2.86
#> 15 virginica Sepal.Width 50 2.97 0.322 2.2 3.8 2.88 3.07
#> 16 Total Sepal.Width 150 3.06 0.436 2 4.4 2.99 3.13
Created on 2021-12-16 by the reprex package (v2.0.1)
Upvotes: 1