vcat
vcat

Reputation: 263

New columns based on calculations of multiple existing columns in R

Original data frame in R df_orginal

AB    CD  EF  GH
A     0   12  M
A     0   13  F
A     0   14  F
A     1   16  M
A     1   17  F
A     1   18  M
A     1   18  M
A-2   0   78  M
A-2   0   12  M
A-2   0   12  F
A-2   1   12  F
A-2   1   47  M
A-2   1   34  F

Outputs I would like: df_1 = max, min, mean of EF based on AB The code I am attempting to use is:

df_1 <- df_original %>% group_by(AB, CD) df_1 <- summarise(EF, max, min, mean)

Error: Problem with summarize() input...1 x Input ...1 must be a vector, not a function Input ...1 is max

AB  CD  Max  Min  Mean 
A   0   14   12   13
A   1   18   16   17.25
A-2 0   78   12   34
A-2 1   47   12   31

And df_2 = total number of M or F from GH based on AB and CD:

df_2 <- df_original %>% group_by(AB, CD, GH) 

setDT(df_original)[,.(M_total = mean(EF)), by = CD
AB  CD  M_total  F_total
A   0   1        2
A   1   3        1
A-2 0   2        1
A-2 1   1        2

Upvotes: 2

Views: 88

Answers (6)

hello_friend
hello_friend

Reputation: 5788

Another tidyverse solution:

library(tidyverse)
df1 <- 
  df_original %>% 
  group_by(AB, CD) %>% 
  summarise(
    across(where(is.numeric),
    list(
      min = min, 
      max = max, 
      mean = mean, 
      median = median)
    )
  )

df2 <- 
  df_original %>% 
  count(GH, AB, CD) %>% 
  pivot_wider(
    id_cols = c(AB, CD), 
    names_from = GH, 
    values_from = n
  )

Upvotes: 0

akrun
akrun

Reputation: 887028

Using collapse

library(collapse)
collap(slt(df_original, AB, CD, EF), ~ AB + CD, list(fmax, fmin, fmean))
#   AB CD fmax.EF fmin.EF fmean.EF
#1   A  0      14      12    13.00
#2   A  1      18      16    17.25
#3 A-2  0      78      12    34.00
4# A-2  1      47      12    31.00

For the second case

library(magrittr)
df_original %>%
    fgroup_by(AB, CD) %>%
    fsummarise(M_total = fsum(GH == 'M'), F_total = fsum(GH == 'F'))
#   AB CD M_total F_total
#1   A  0       1       2
#2   A  1       3       1
#3 A-2  0       2       1
#4 A-2  1       1       2

data

df_original <- structure(list(AB = c("A", "A", "A", "A", "A", "A", "A", "A-2", 
"A-2", "A-2", "A-2", "A-2", "A-2"), CD = c(0L, 0L, 0L, 1L, 1L, 
1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L), EF = c(12L, 13L, 14L, 16L, 17L, 
18L, 18L, 78L, 12L, 12L, 12L, 47L, 34L), GH = c("M", "F", "F", 
"M", "F", "M", "M", "M", "M", "F", "F", "M", "F")),
class = "data.frame", row.names = c(NA, 
-13L))

Upvotes: 1

Justin K
Justin K

Reputation: 11

Part 1: The tidy way!

df_1 <- df_original %>% 
    group_by(AB, CD) %>%
    summarise(count = sum(EF),
              EF_max = max(EF),
              EF_min = min(EF),
              EF_mean = mean(EF))

Part 2: The tidy way!* Although I am not entirely clear about what you want precisely.

df_2 <- df_original %>% 
    group_by(AB, CD, GH) %>%
    tally()

Upvotes: 0

Gregor Thomas
Gregor Thomas

Reputation: 145765

If you use across in dplyr you can provide a list of functions:

df_1 <- df_original %>% 
  group_by(AB, CD) %>%
  summarise(across(EF, .fns = list(max, min, mean)))

For df_2, with only two level's Matt Kaye's solution writing them out directly is expedient.

Upvotes: 3

Matt Kaye
Matt Kaye

Reputation: 520

And a dplyr solution!

df_1 <- df_original %>% 
  group_by(AB, CD) %>% 
  summarize(
    max = max(EF), 
    min = min(EF),
    mean = mean(EF)
  )

and #2

df_2 <- df_original %>% 
  group_by(AB, CD, GH) %>%
  summarize(
    M_total = sum(GH == 'M'),
    F_total = sum(GH == 'F')
  )

Upvotes: 3

ThomasIsCoding
ThomasIsCoding

Reputation: 101189

Here is a data.table option

setDT(df)
df_1 <- df[, .(Max = max(EF), Min = min(EF), Mean = mean(EF)), .(AB, CD)]
df_2 <- dcast(df, AB + CD ~ paste0(GH, "_Total"))

and you will see

> df_1
    AB CD Max Min  Mean
1:   A  0  14  12 13.00
2:   A  1  18  16 17.25
3: A-2  0  78  12 34.00
4: A-2  1  47  12 31.00

> df_2
    AB CD F_Total M_Total
1:   A  0       2       1
2:   A  1       1       3
3: A-2  0       1       2
4: A-2  1       2       1

Upvotes: 2

Related Questions