Ulli
Ulli

Reputation: 3

Generate summary statistics by subgroup with stargazer

I try to generate a summary statistics table for latex in R with stargazer. The table should contain the summary statistics sorted by three subgroups (Rwanda/Honduras/Nepal).

It worked out fine, when I did seperate tables only for the subgroup. I thought maybe the country variable is the problem.

The all_summary data frame looks like this:

structure(list(country = structure(c("Honduras", "Nepal", "Rwanda"
), label = "Country", format.stata = "%8s"), headGender = structure(c(0, 
1, 0), label = "head_gender", format.stata = "%9.0g"), femaleEduc = structure(c(1, 
2, 2), label = "female_educ", format.stata = "%9.0g"), maleEduc = structure(c(1, 
1, 2), label = "male_educ", format.stata = "%9.0g"), wVispeople = structure(c(0, 
1, 0), label = "w_visitpeople", format.stata = "%9.0g"), wVismarket = structure(c(0, 
1, 1), label = "w_vismarket", format.stata = "%9.0g"), wLeavevill = structure(c(0, 
1, 0), label = "w_leavevill", format.stata = "%9.0g"), fridge = structure(c(1, 
0, 0), label = "fridge_owned_desired", format.stata = "%9.0g"), 
    radio = structure(c(1, 1, 1), label = "radio_owned_desired", format.stata = "%9.0g"), 
    fan = structure(c(0, 0, 0), label = "fan_owned_desired", format.stata = "%9.0g"), 
    pc = structure(c(0, 0, 0), label = "pc_owned_desired", format.stata = "%9.0g"), 
    tv = structure(c(1, 0, 1), label = "tv_owned_desired", format.stata = "%9.0g"), 
    minutesSolid = structure(c(3, 2, 448), label = "stoveuseminutes_solids", format.stata = "%9.0g"), 
    minutesClean = structure(c(0, 0, 0), label = "stoveuseminutes_clean", format.stata = "%9.0g"), 
    stoveClean = structure(c(0, 0, 0), label = "stove_clean", format.stata = "%9.0g")), row.names = c(NA, 
-3L), class = c("tbl_df", "tbl", "data.frame"), label = "Written by R.")

This is how the code looks like:

  all_summary <- allcountries %>%
 select(Country, head_gender, female_educ, male_educ, w_visitpeople, w_vismarket, w_leavevill,    fridge_owned_desired, radio_owned_desired, fan_owned_desired, pc_owned_desired, tv_owned_desired, stoveuseminutes_solids, stoveuseminutes_clean, stove_clean) %>%
rename(country = Country,
     headGender = head_gender,
     femaleEduc = female_educ,
     maleEduc = male_educ,
     wVispeople = w_visitpeople,
     wVismarket = w_vismarket,
     wLeavevill = w_leavevill,
     fridge = fridge_owned_desired,
     radio = radio_owned_desired,
     fan = fan_owned_desired,
     pc = pc_owned_desired,
     tv = tv_owned_desired,
     minutesSolid = stoveuseminutes_solids,
     minutesClean = stoveuseminutes_clean,
     stoveClean = stove_clean)

#Group by country
all_summary_grouped <- all_summary %>%  group_by(country)

sumstats_all_grouped <-
   all_summary_grouped %>%
   summarise_each(funs(
   n = sum(!is.na(.)),
   min = min(., na.rm = TRUE),
   max = max(., na.rm = TRUE),
   mean = mean(., na.rm = TRUE)
   ))

#Reshape data
sumstatsA <- sumstats_all_grouped %>%
gather(stat, val) %>%
separate(stat, into = c ("var", "stat"), sep = "_") %>%
spread(stat, val)  %>%
select(var, n, min, max, mean)

 #Round
 sumstatsA = sumstatsA %>%
 mutate(mean = round(as.numeric(mean),2))

#produce table
stargazer(
sumstatsA, 
summary = F, 
type = "text", 
digits = 2, 
header = F , 
title = "Summary statistics for Honduras, Nepal and Rwanda", 
rownames = F, 
out = "Manuscript/Tables/SummaryAll_grouped.tex")

The error happens after the #Reshape data command section (Error in spread(): ! Each row of output must be identified by a unique combination of keys. Keys are shared for 171 rows:

Upvotes: 0

Views: 734

Answers (1)

Marco
Marco

Reputation: 2817

Here is some minimal raw data:

library(tidyverse)
data(mtcars)
as_tibble(mtcars)

# A tibble: 32 × 11
     mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1  21       6  160    110  3.9   2.62  16.5     0     1     4     4
 2  21       6  160    110  3.9   2.88  17.0     0     1     4     4
 3  22.8     4  108     93  3.85  2.32  18.6     1     1     4     1
 4  21.4     6  258    110  3.08  3.22  19.4     1     0     3     1
 5  18.7     8  360    175  3.15  3.44  17.0     0     0     3     2
 6  18.1     6  225    105  2.76  3.46  20.2     1     0     3     1
 7  14.3     8  360    245  3.21  3.57  15.8     0     0     3     4
 8  24.4     4  147.    62  3.69  3.19  20       1     0     4     2
 9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4     2
10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4     4
# … with 22 more rows
# ℹ Use `print(n = ...)` to see more rows

This is the gtsummary package that produces easy summary statistics per group (e.g. cyl) in default html format (you can have various modifications for the table):

library(gtsummary)
mtcars %>% tbl_summary(by = cyl)

enter image description here

There are options to print this as .tex:

# print as tex
as_kable_extra(mtcars %>% tbl_summary(by = cyl), format = "latex")

\begin{tabular}{l|c|c|c}
\hline
\textbf{Characteristic} & \textbf{4}, N = 11 & \textbf{6}, N = 7 & \textbf{8}, N = 14\\
\hline
mpg & 26.0 (22.8, 30.4) & 19.7 (18.6, 21.0) & 15.2 (14.4, 16.2)\\
\hline
disp & 108 (79, 121) & 168 (160, 196) & 350 (302, 390)\\
\hline
hp & 91 (66, 96) & 110 (110, 123) & 192 (176, 241)\\
\hline
drat & 4.08 (3.81, 4.16) & 3.90 (3.35, 3.91) & 3.12 (3.07, 3.22)\\
\hline
wt & 2.20 (1.89, 2.62) & 3.21 (2.82, 3.44) & 3.76 (3.53, 4.01)\\
\hline
qsec & 18.90 (18.56, 19.95) & 18.30 (16.74, 19.17) & 17.18 (16.10, 17.56)\\
\hline
vs & 10 (91\%) & 4 (57\%) & 0 (0\%)\\
\hline
am & 8 (73\%) & 3 (43\%) & 2 (14\%)\\
\hline
gear &  &  & \\
\hline
\hspace{1em}3 & 1 (9.1\%) & 2 (29\%) & 12 (86\%)\\
\hline
\hspace{1em}4 & 8 (73\%) & 4 (57\%) & 0 (0\%)\\
\hline
\hspace{1em}5 & 2 (18\%) & 1 (14\%) & 2 (14\%)\\
\hline
carb &  &  & \\
\hline
\hspace{1em}1 & 5 (45\%) & 2 (29\%) & 0 (0\%)\\
\hline
\hspace{1em}2 & 6 (55\%) & 0 (0\%) & 4 (29\%)\\
\hline
\hspace{1em}3 & 0 (0\%) & 0 (0\%) & 3 (21\%)\\
\hline
\hspace{1em}4 & 0 (0\%) & 4 (57\%) & 6 (43\%)\\
\hline
\hspace{1em}6 & 0 (0\%) & 1 (14\%) & 0 (0\%)\\
\hline
\hspace{1em}8 & 0 (0\%) & 0 (0\%) & 1 (7.1\%)\\
\hline
\multicolumn{4}{l}{\rule{0pt}{1em}\textsuperscript{1} Median (IQR); n (\%)}\\
\end{tabular}

Update

Here is a list of various statistics that you can specify via options. It now shows the number of non-missings, the mean, the median, p25 and p75 and the range from min to max.

mtcars %>% tbl_summary(by = am,
                       type = all_continuous() ~ "continuous2",
                       statistic = all_continuous() ~ c(
                         "{N_nonmiss}",
                         "{mean}",
                         "{median} ({p25}, {p75})",
                         "{min}, {max}"
                       ))

enter image description here

Upvotes: 0

Related Questions