Reputation: 326
I have a dataframe with hundreads of columns. Just for example purposes I'm going to present a toy dataframe.
TPT_A_2 | TPT_B_2 | TPT_C_2 | TPT_A_4 | TPT_B_4 | TPT_C_4 | TPT_A_6 | TPT_B_6 | TPT_C_6 |
100 100 100 200 200 200 400 400 400
I want to compute the mean for those variables with the same initial substrings as name (TPT_A, TPT_B..) that end with 2 and 4. So I would get something like:
TPT_A_mean | TPT_B_mean | TPT_C_mean | TPT_A_6 | TPT_B_6 | TPT_C_6 |
150 150 150 400 400 400
This data would be:
row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]
Upvotes: 4
Views: 81
Reputation: 25353
Another possible solution:
library(tidyverse)
row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 400, 400)
data <- as.data.frame(rbind(row1, row2))
colnames(data) <- as.character(data[1,])
data <- data[-1,]
data %>%
pivot_longer(everything()) %>%
mutate(value = as.numeric(value)) %>%
mutate(aux = if_else(str_detect(name, "2$|4$"), 1, 0),
name1 = str_extract(name, "TPT_[A-Z]")) %>%
group_by(name1, aux) %>%
summarise(
avg = if_else(aux == 0, value, mean(value)),
names2 = if_else(aux == 0, name, str_c(name1, "_mean")),
.groups = "drop") %>%
distinct %>% select(-name1,-aux) %>%
pivot_wider(names_from = names2, values_from = avg) %>%
select(ends_with("mean"), matches("\\d$"))
#> # A tibble: 1 × 6
#> TPT_A_mean TPT_B_mean TPT_C_mean TPT_A_6 TPT_B_6 TPT_C_6
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 150 150 150 400 400 400
Upvotes: 0
Reputation: 160587
First, your method for generating a frame is an anti-pattern, resulting in your numbers being converted to strings.
str(dat)
# 'data.frame': 1 obs. of 9 variables:
# $ TPT_A_2: chr "100"
# $ TPT_B_2: chr "100"
# $ TPT_C_2: chr "100"
# $ TPT_A_4: chr "200"
# $ TPT_B_4: chr "200"
# $ TPT_C_4: chr "200"
# $ TPT_A_6: chr "400"
# $ TPT_B_6: chr "40"
# $ TPT_C_6: chr "400"
In this case, we can use:
row1 <- c("TPT_A_2", "TPT_B_2", "TPT_C_2","TPT_A_4", "TPT_B_4", "TPT_C_4", "TPT_A_6", "TPT_B_6", "TPT_C_6")
row2 <- c(100, 100, 100, 200, 200, 200, 400, 40, 400)
dat <- as.data.frame(setNames(as.list(row2),row1))
str(dat)
# 'data.frame': 1 obs. of 9 variables:
# $ TPT_A_2: num 100
# $ TPT_B_2: num 100
# $ TPT_C_2: num 100
# $ TPT_A_4: num 200
# $ TPT_B_4: num 200
# $ TPT_C_4: num 200
# $ TPT_A_6: num 400
# $ TPT_B_6: num 40
# $ TPT_C_6: num 400
From here ...
dat2a <- subset(dat, select = grepl("TPT_[ABC]_[24]", colnames(dat)))
dat2b <- subset(dat, select = !grepl("TPT_[ABC]_[24]", colnames(dat)))
cbind(
dat2b,
lapply(split.default(dat2a, gsub("_[24]$", "", colnames(dat2a))),
function(z) mean(unlist(z)))
)
# TPT_A_6 TPT_B_6 TPT_C_6 TPT_A TPT_B TPT_C
# 1 400 40 400 150 150 150
library(dplyr)
library(purrr) # imap
dat %>%
split.default(., gsub("_[24]$", "", colnames(.))) %>%
imap(., function(x, nm) {
if (ncol(x) > 1) {
setNames(data.frame(mean(unlist(x))), paste0(nm, "_mean"))
} else x
}) %>%
bind_cols()
# TPT_A_mean TPT_A_6 TPT_B_mean TPT_B_6 TPT_C_mean TPT_C_6
# 1 150 400 150 40 150 400
Upvotes: 5
Reputation: 4344
One option is use a pivot function from tidyr
to make the data long and work from there within the tidyverse
:
library(tidyverse) # some prefer to call only the needed packages instead of the whole tidyverse
data %>%
# make the data long
tidyr::pivot_longer(1:last_col()) %>%
# cut the before column names to desired length and check for 2 or 4 to paste mean else 6
dplyr::mutate(grp = paste0(stringr::str_sub(name, 1, 5),
"_",
ifelse(stringr::str_detect(name, pattern = "2|4"), "mean", "6"))) %>%
# build groupings
dplyr::group_by(grp) %>%
# caluclate mean
dplyr::summarise(means = mean(as.numeric(value))) %>%
# make table wide again
tidyr::pivot_wider(names_from = "grp", values_from = "means")
# A tibble: 1 x 6
TPT_A_6 TPT_A_mean TPT_B_6 TPT_B_mean TPT_C_6 TPT_C_mean
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 400 150 40 150 400 150
There is a small typo in your sample data therefore column TPT_B_6 is 40 not 400
Upvotes: 4