Reputation: 326

Compute for column values based on conditional substrings in the column names

I have a dataframe with hundreads of columns. Just for example purposes I'm going to present a toy dataframe.

TPT_A_2 | TPT_B_2 | TPT_C_2 | TPT_A_4 | TPT_B_4 | TPT_C_4 | TPT_A_6 | TPT_B_6 | TPT_C_6 | 
 100        100       100       200       200      200       400       400        400

I want to compute the mean for those variables with the same initial substrings as name (TPT_A, TPT_B..) that end with 2 and 4. So I would get something like:

TPT_A_mean | TPT_B_mean | TPT_C_mean | TPT_A_6 | TPT_B_6 | TPT_C_6 | 
  150           150          150         400      400        400

This data would be:

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)   
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]

Upvotes: 4

Answers (3)

PaulS

Reputation: 25353

Another possible solution:

library(tidyverse)

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 400, 400)   
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]

data %>% 
  pivot_longer(everything()) %>%
  mutate(value = as.numeric(value)) %>% 
  mutate(aux = if_else(str_detect(name, "2$|4$"), 1, 0),
         name1 = str_extract(name, "TPT_[A-Z]")) %>% 
  group_by(name1, aux) %>% 
  summarise(
    avg = if_else(aux == 0, value, mean(value)), 
    names2 = if_else(aux == 0, name, str_c(name1, "_mean")),
    .groups = "drop") %>% 
  distinct %>% select(-name1,-aux) %>%  
  pivot_wider(names_from = names2, values_from = avg) %>% 
  select(ends_with("mean"), matches("\\d$"))

#> # A tibble: 1 × 6
#>   TPT_A_mean TPT_B_mean TPT_C_mean TPT_A_6 TPT_B_6 TPT_C_6
#>        <dbl>      <dbl>      <dbl>   <dbl>   <dbl>   <dbl>
#> 1        150        150        150     400     400     400

Upvotes: 0

r2evans

Reputation: 160587

First, your method for generating a frame is an anti-pattern, resulting in your numbers being converted to strings.

str(dat)
# 'data.frame': 1 obs. of  9 variables:
#  $ TPT_A_2: chr "100"
#  $ TPT_B_2: chr "100"
#  $ TPT_C_2: chr "100"
#  $ TPT_A_4: chr "200"
#  $ TPT_B_4: chr "200"
#  $ TPT_C_4: chr "200"
#  $ TPT_A_6: chr "400"
#  $ TPT_B_6: chr "40"
#  $ TPT_C_6: chr "400"

In this case, we can use:

row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)   
dat <- as.data.frame(setNames(as.list(row2),row1))
str(dat)
# 'data.frame': 1 obs. of  9 variables:
#  $ TPT_A_2: num 100
#  $ TPT_B_2: num 100
#  $ TPT_C_2: num 100
#  $ TPT_A_4: num 200
#  $ TPT_B_4: num 200
#  $ TPT_C_4: num 200
#  $ TPT_A_6: num 400
#  $ TPT_B_6: num 40
#  $ TPT_C_6: num 400

From here ...

base R

dat2a <- subset(dat, select = grepl("TPT_[ABC]_[24]", colnames(dat)))
dat2b <- subset(dat, select = !grepl("TPT_[ABC]_[24]", colnames(dat)))
cbind(
  dat2b, 
  lapply(split.default(dat2a, gsub("_[24]$", "", colnames(dat2a))),
         function(z) mean(unlist(z)))
)
#   TPT_A_6 TPT_B_6 TPT_C_6 TPT_A TPT_B TPT_C
# 1     400      40     400   150   150   150

dplyr

library(dplyr)
library(purrr) # imap
dat %>%
  split.default(., gsub("_[24]$", "",  colnames(.))) %>%
  imap(., function(x, nm)  {
    if (ncol(x) > 1) {
      setNames(data.frame(mean(unlist(x))), paste0(nm, "_mean"))
    } else x
  }) %>%
  bind_cols()
#   TPT_A_mean TPT_A_6 TPT_B_mean TPT_B_6 TPT_C_mean TPT_C_6
# 1        150     400        150      40        150     400

Upvotes: 5

DPH

Reputation: 4344

One option is use a pivot function from tidyr to make the data long and work from there within the tidyverse:

library(tidyverse) # some prefer to call only the needed packages instead of the whole tidyverse

data %>%
    # make the data long
    tidyr::pivot_longer(1:last_col()) %>% 
    # cut the before column names to desired length and check for 2 or 4 to paste mean else 6
    dplyr::mutate(grp = paste0(stringr::str_sub(name, 1, 5),
                               "_",
                               ifelse(stringr::str_detect(name, pattern = "2|4"), "mean", "6"))) %>% 
    # build groupings
    dplyr::group_by(grp) %>% 
    # caluclate mean
    dplyr::summarise(means = mean(as.numeric(value))) %>%
    # make table wide again
    tidyr::pivot_wider(names_from = "grp", values_from = "means")

# A tibble: 1 x 6
  TPT_A_6 TPT_A_mean TPT_B_6 TPT_B_mean TPT_C_6 TPT_C_mean
    <dbl>      <dbl>   <dbl>      <dbl>   <dbl>      <dbl>
1     400        150      40        150     400        150

There is a small typo in your sample data therefore column TPT_B_6 is 40 not 400

Upvotes: 4

Compute for column values based on conditional substrings in the column names

Answers (3)

base R

dplyr

Related Questions