coding
coding

Reputation: 1207

Sum different values from categorical data?

I have a df with this structure:

state sex population deaths   year

CAL    F    5           3      1980
CAL    M    4           1      1980
TX     F    10          6      2001
TX     M    16          5      2001

I would like to sum the female (F) population and Male (M) population and deaths and have a row like this:

state sex population deaths year  

CAL    F    5           3   1980
CAL    M    4           1   1980
CAL    T     9          4   1980 
TX     F    10          6   2001 
TX     M    16          5   2001
TX     T    26          11  2001

Where T is the sum of female and Male Data

Upvotes: 0

Views: 81

Answers (2)

akrun
akrun

Reputation: 887158

We can group_split by 'state', loop over the list with map, use adorn_totals from janitor to get the sum of numeric columns as a new row and collapse the list to a single dataset with _dfr as suffix in map

library(dplyr)
library(janitor)
library(purrr)
df1 %>% 
    group_split(state) %>%
    map_dfr(~ .x %>% adorn_totals(name = first(.x$state),
         fill = "T"))

-output

#   state sex population deaths
#   CAL   F          5      3
#   CAL   M          4      1
#   CAL   T          9      4
#    TX   F         10      6
#    TX   M         16      5
#    TX   T         26     11

Or do a group by summarise and bind the summarised output with the original

df1 %>%
  group_by(state) %>% 
  summarise(sex = "T", across(c(population, deaths), sum),
         .groups = 'drop') %>% 
  bind_rows(df1, .) %>% 
  arrange(state)

Update

Based on the updated data

df2 %>%
   group_split(state) %>%
   map_dfr(~ .x %>% 
              adorn_totals(name = first(.x$state), fill = "T") %>%
   mutate(year = first(year)))
# state sex population deaths year
#   CAL   F          5      3 1980
#   CAL   M          4      1 1980
#   CAL   T          9      4 1980
#    TX   F         10      6 2001
#    TX   M         16      5 2001
#    TX   T         26     11 2001

library(tidyr)
df2 %>% 
    group_by(state) %>% 
    summarise(sex = "T", across(c(population, deaths), sum),
          .groups = 'drop') %>% 
   bind_rows(df2, .) %>% 
   arrange(state) %>% 
   fill(year)
#  state sex population deaths year
#1   CAL   F          5      3 1980
#2   CAL   M          4      1 1980
#3   CAL   T          9      4 1980
#4    TX   F         10      6 2001
#5    TX   M         16      5 2001
#6    TX   T         26     11 2001

data

df1 <- structure(list(state = c("CAL", "CAL", "TX", "TX"), sex = c("F", 
"M", "F", "M"), population = c(5L, 4L, 10L, 16L), deaths = c(3L, 
1L, 6L, 5L)), class = "data.frame", row.names = c(NA, -4L))


df2 <- structure(list(state = c("CAL", "CAL", "TX", "TX"), sex = c("F", 
"M", "F", "M"), population = c(5L, 4L, 10L, 16L), deaths = c(3L, 
1L, 6L, 5L), year = c(1980L, 1980L, 2001L, 2001L)),
class = "data.frame", row.names = c(NA, 
-4L))

Upvotes: 0

ThomasIsCoding
ThomasIsCoding

Reputation: 101538

A data.table option

setDT(df)[,rbind(.SD,c(.(sex = "T"),colSums(.SD[,-1]))),state,.SDcols = c("sex", "popularion", "deaths")]

gives

   state sex population deaths
1:   CAL   F          5      3
2:   CAL   M          4      1
3:   CAL   T          9      4
4:    TX   F         10      6
5:    TX   M         16      5
6:    TX   T         26     11

Data

> dput(df)
structure(list(state = c("CAL", "CAL", "TX", "TX"), sex = c("F",
"M", "F", "M"), population = c(5L, 4L, 10L, 16L), deaths = c(3L,
1L, 6L, 5L)), class = "data.frame", row.names = c(NA, -4L))

Upvotes: 0

Related Questions